import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Load Data¶
df_train = pd.DataFrame(pd.read_csv('/content/drive/MyDrive/DL_Project/trainv1.csv'))
df_test = pd.DataFrame(pd.read_csv('/content/drive/MyDrive/DL_Project/testv1.csv'))
df_train
<ipython-input-4-cf622a5b4f5d>:1: DtypeWarning: Columns (26) have mixed types. Specify dtype option on import or set low_memory=False.
df_train = pd.DataFrame(pd.read_csv('/content/drive/MyDrive/DL_Project/trainv1.csv'))
| ID | Customer_ID | Month | Name | Age | SSN | Occupation | Annual_Income | Monthly_Inhand_Salary | Num_Bank_Accounts | ... | Credit_Mix | Outstanding_Debt | Credit_Utilization_Ratio | Credit_History_Age | Payment_of_Min_Amount | Total_EMI_per_month | Amount_invested_monthly | Payment_Behaviour | Monthly_Balance | Credit_Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0x1602 | CUS_0xd40 | January | Aaron Maashoh | 23 | 821-00-0265 | Scientist | 19114.12 | 1824.843333 | 3 | ... | _ | 809.98 | 26.822620 | 22 Years and 1 Months | No | 49.574949 | 80.41529543900253 | High_spent_Small_value_payments | 312.49408867943663 | Good |
| 1 | 0x1603 | CUS_0xd40 | February | Aaron Maashoh | 23 | 821-00-0265 | Scientist | 19114.12 | NaN | 3 | ... | Good | 809.98 | 31.944960 | NaN | No | 49.574949 | 118.28022162236736 | Low_spent_Large_value_payments | 284.62916249607184 | Good |
| 2 | 0x1604 | CUS_0xd40 | March | Aaron Maashoh | -500 | 821-00-0265 | Scientist | 19114.12 | NaN | 3 | ... | Good | 809.98 | 28.609352 | 22 Years and 3 Months | No | 49.574949 | 81.699521264648 | Low_spent_Medium_value_payments | 331.2098628537912 | Good |
| 3 | 0x1605 | CUS_0xd40 | April | Aaron Maashoh | 23 | 821-00-0265 | Scientist | 19114.12 | NaN | 3 | ... | Good | 809.98 | 31.377862 | 22 Years and 4 Months | No | 49.574949 | 199.4580743910713 | Low_spent_Small_value_payments | 223.45130972736786 | Good |
| 4 | 0x1606 | CUS_0xd40 | May | Aaron Maashoh | 23 | 821-00-0265 | Scientist | 19114.12 | 1824.843333 | 3 | ... | Good | 809.98 | 24.797347 | 22 Years and 5 Months | No | 49.574949 | 41.420153086217326 | High_spent_Medium_value_payments | 341.48923103222177 | Good |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 0x25fe9 | CUS_0x942c | April | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | 3359.415833 | 4 | ... | _ | 502.38 | 34.663572 | 31 Years and 6 Months | No | 35.104023 | 60.97133255718485 | High_spent_Large_value_payments | 479.866228 | Poor |
| 99996 | 0x25fea | CUS_0x942c | May | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | 3359.415833 | 4 | ... | _ | 502.38 | 40.565631 | 31 Years and 7 Months | No | 35.104023 | 54.18595028760385 | High_spent_Medium_value_payments | 496.65161 | Poor |
| 99997 | 0x25feb | CUS_0x942c | June | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | 3359.415833 | 4 | ... | Good | 502.38 | 41.255522 | 31 Years and 8 Months | No | 35.104023 | 24.02847744864441 | High_spent_Large_value_payments | 516.809083 | Poor |
| 99998 | 0x25fec | CUS_0x942c | July | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | 3359.415833 | 4 | ... | Good | 502.38 | 33.638208 | 31 Years and 9 Months | No | 35.104023 | 251.67258219721603 | Low_spent_Large_value_payments | 319.164979 | Standard |
| 99999 | 0x25fed | CUS_0x942c | August | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99_ | 3359.415833 | 4 | ... | Good | 502.38 | 34.192463 | 31 Years and 10 Months | No | 35.104023 | 167.1638651610451 | !@9#%8 | 393.673696 | Poor |
100000 rows × 28 columns
df_test
| ID | Customer_ID | Month | Name | Age | SSN | Occupation | Annual_Income | Monthly_Inhand_Salary | Num_Bank_Accounts | ... | Num_Credit_Inquiries | Credit_Mix | Outstanding_Debt | Credit_Utilization_Ratio | Credit_History_Age | Payment_of_Min_Amount | Total_EMI_per_month | Amount_invested_monthly | Payment_Behaviour | Monthly_Balance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0x160a | CUS_0xd40 | September | Aaron Maashoh | 23 | 821-00-0265 | Scientist | 19114.12 | 1824.843333 | 3 | ... | 2022.0 | Good | 809.98 | 35.030402 | 22 Years and 9 Months | No | 49.574949 | 236.64268203272135 | Low_spent_Small_value_payments | 186.26670208571772 |
| 1 | 0x160b | CUS_0xd40 | October | Aaron Maashoh | 24 | 821-00-0265 | Scientist | 19114.12 | 1824.843333 | 3 | ... | 4.0 | Good | 809.98 | 33.053114 | 22 Years and 10 Months | No | 49.574949 | 21.465380264657146 | High_spent_Medium_value_payments | 361.44400385378196 |
| 2 | 0x160c | CUS_0xd40 | November | Aaron Maashoh | 24 | 821-00-0265 | Scientist | 19114.12 | 1824.843333 | 3 | ... | 4.0 | Good | 809.98 | 33.811894 | NaN | No | 49.574949 | 148.23393788500925 | Low_spent_Medium_value_payments | 264.67544623342997 |
| 3 | 0x160d | CUS_0xd40 | December | Aaron Maashoh | 24_ | 821-00-0265 | Scientist | 19114.12 | NaN | 3 | ... | 4.0 | Good | 809.98 | 32.430559 | 23 Years and 0 Months | No | 49.574949 | 39.08251089460281 | High_spent_Medium_value_payments | 343.82687322383634 |
| 4 | 0x1616 | CUS_0x21b1 | September | Rick Rothackerj | 28 | 004-07-5839 | _______ | 34847.84 | 3037.986667 | 2 | ... | 5.0 | Good | 605.03 | 25.926822 | 27 Years and 3 Months | No | 18.816215 | 39.684018417945296 | High_spent_Large_value_payments | 485.2984336755923 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 49995 | 0x25fe5 | CUS_0x8600 | December | Sarah McBridec | 4975 | 031-35-0942 | Architect | 20002.88 | 1929.906667 | 10 | ... | 12.0 | _ | 3571.7 | 34.780553 | NaN | Yes | 60.964772 | 146.48632477751087 | Low_spent_Small_value_payments | 275.53956951573343 |
| 49996 | 0x25fee | CUS_0x942c | September | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | NaN | 4 | ... | 7.0 | Good | 502.38 | 27.758522 | 31 Years and 11 Months | NM | 35.104023 | 181.44299902757518 | Low_spent_Small_value_payments | 409.39456169535066 |
| 49997 | 0x25fef | CUS_0x942c | October | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | 3359.415833 | 4 | ... | 7.0 | Good | 502.38 | 36.858542 | 32 Years and 0 Months | No | 35.104023 | __10000__ | Low_spent_Large_value_payments | 349.7263321025098 |
| 49998 | 0x25ff0 | CUS_0x942c | November | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | NaN | 4 | ... | 7.0 | Good | 502.38 | 39.139840 | 32 Years and 1 Months | No | 35.104023 | 97.59857973344877 | High_spent_Small_value_payments | 463.23898098947717 |
| 49999 | 0x25ff1 | CUS_0x942c | December | Nicks | 25 | 078-73-5990 | Mechanic | 39628.99 | 3359.415833 | 4 | ... | 7.0 | _ | 502.38 | 34.108530 | 32 Years and 2 Months | No | 35.104023 | 220.45787812168732 | Low_spent_Medium_value_payments | 360.37968260123847 |
50000 rows × 27 columns
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100000 entries, 0 to 99999 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 100000 non-null object 1 Customer_ID 100000 non-null object 2 Month 100000 non-null object 3 Name 90015 non-null object 4 Age 100000 non-null object 5 SSN 100000 non-null object 6 Occupation 100000 non-null object 7 Annual_Income 100000 non-null object 8 Monthly_Inhand_Salary 84998 non-null float64 9 Num_Bank_Accounts 100000 non-null int64 10 Num_Credit_Card 100000 non-null int64 11 Interest_Rate 100000 non-null int64 12 Num_of_Loan 100000 non-null object 13 Type_of_Loan 88592 non-null object 14 Delay_from_due_date 100000 non-null int64 15 Num_of_Delayed_Payment 92998 non-null object 16 Changed_Credit_Limit 100000 non-null object 17 Num_Credit_Inquiries 98035 non-null float64 18 Credit_Mix 100000 non-null object 19 Outstanding_Debt 100000 non-null object 20 Credit_Utilization_Ratio 100000 non-null float64 21 Credit_History_Age 90970 non-null object 22 Payment_of_Min_Amount 100000 non-null object 23 Total_EMI_per_month 100000 non-null float64 24 Amount_invested_monthly 95521 non-null object 25 Payment_Behaviour 100000 non-null object 26 Monthly_Balance 98800 non-null object 27 Credit_Score 100000 non-null object dtypes: float64(4), int64(4), object(20) memory usage: 21.4+ MB
for col in df_train.columns:
unique_values = df_train[col].unique()
print(f"Column: {col}")
print(f"Unique Values (Top 20): {unique_values[:20]}")
print(f"Total Unique Values: {len(unique_values)}")
print("=" * 20)
Column: ID Unique Values (Top 20): ['0x1602' '0x1603' '0x1604' '0x1605' '0x1606' '0x1607' '0x1608' '0x1609' '0x160e' '0x160f' '0x1610' '0x1611' '0x1612' '0x1613' '0x1614' '0x1615' '0x161a' '0x161b' '0x161c' '0x161d'] Total Unique Values: 100000 ==================== Column: Customer_ID Unique Values (Top 20): ['CUS_0xd40' 'CUS_0x21b1' 'CUS_0x2dbc' 'CUS_0xb891' 'CUS_0x1cdb' 'CUS_0x95ee' 'CUS_0x284a' 'CUS_0x5407' 'CUS_0x4157' 'CUS_0xba08' 'CUS_0xa66b' 'CUS_0xc0ab' 'CUS_0x3e45' 'CUS_0x6c66' 'CUS_0xff4' 'CUS_0x33d2' 'CUS_0x6070' 'CUS_0xfdb' 'CUS_0x3553' 'CUS_0x4100'] Total Unique Values: 12500 ==================== Column: Month Unique Values (Top 20): ['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'] Total Unique Values: 8 ==================== Column: Name Unique Values (Top 20): ['Aaron Maashoh' nan 'Rick Rothackerj' 'Langep' 'Jasond' 'Deepaa' 'Np' 'Nadiaq' 'Annk' 'Charlie Zhur' 'Jamesj' 'Saphirj' 'Soyoungd' 'Harriet McLeodd' 'Sinead Carews' 'Poornimaf' 'Chalmersa' 'Parkm' 'Patrickg' 'Laurence Frosty'] Total Unique Values: 10140 ==================== Column: Age Unique Values (Top 20): ['23' '-500' '28_' '28' '34' '54' '55' '21' '31' '33' '34_' '7580' '30' '30_' '24' '24_' '44' '45' '40' '41'] Total Unique Values: 1788 ==================== Column: SSN Unique Values (Top 20): ['821-00-0265' '#F%$D@*&8' '004-07-5839' '486-85-3974' '072-31-6145' '615-06-7821' '612-70-8987' '411-51-0676' '500-92-6408' '070-19-1622' '366-68-1681' '221-30-8554' '342-90-2649' '414-53-2918' '328-33-6328' '655-05-7666' '965-46-2491' '891-55-9364' '928-91-4452' '084-25-3745'] Total Unique Values: 12501 ==================== Column: Occupation Unique Values (Top 20): ['Scientist' '_______' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer' 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant' 'Musician' 'Mechanic' 'Writer' 'Architect'] Total Unique Values: 16 ==================== Column: Annual_Income Unique Values (Top 20): ['19114.12' '34847.84' '34847.84_' '143162.64' '30689.89' '30689.89_' '35547.71_' '35547.71' '73928.46' '131313.4' '10909427.0' '34081.38_' '34081.38' '114838.41' '114838.41_' '31370.8' '33751.27' '88640.24' '88640.24_' '54392.16'] Total Unique Values: 18940 ==================== Column: Monthly_Inhand_Salary Unique Values (Top 20): [ 1824.84333333 nan 3037.98666667 12187.22 2612.49083333 2853.30916667 5988.705 11242.78333333 10469.20775939 2611.115 9843.8675 2825.23333333 2948.60583333 7266.68666667 4766.68 519.12875 2415.855 2942.14833333 7591.59 2898.385 ] Total Unique Values: 13236 ==================== Column: Num_Bank_Accounts Unique Values (Top 20): [ 3 2 1 7 4 0 8 5 6 9 10 1414 1231 67 572 1488 91 528 1647 1696] Total Unique Values: 943 ==================== Column: Num_Credit_Card Unique Values (Top 20): [ 4 1385 5 1288 1 7 6 1029 488 8 1381 898 3 518 1005 9 1327 1189 2 10] Total Unique Values: 1179 ==================== Column: Interest_Rate Unique Values (Top 20): [ 3 6 8 4 5 5318 15 7 12 20 1 433 14 32 16 17 5240 4975 10 31] Total Unique Values: 1750 ==================== Column: Num_of_Loan Unique Values (Top 20): ['4' '1' '3' '967' '-100' '0' '0_' '2' '3_' '2_' '7' '5' '5_' '6' '8' '8_' '9' '9_' '4_' '7_'] Total Unique Values: 434 ==================== Column: Type_of_Loan Unique Values (Top 20): ['Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan' 'Credit-Builder Loan' 'Auto Loan, Auto Loan, and Not Specified' 'Not Specified' nan 'Credit-Builder Loan, and Mortgage Loan' 'Not Specified, Auto Loan, and Student Loan' 'Personal Loan, Debt Consolidation Loan, and Auto Loan' 'Not Specified, and Payday Loan' 'Credit-Builder Loan, Personal Loan, and Auto Loan' 'Payday Loan, and Payday Loan' 'Not Specified, Student Loan, and Personal Loan' 'Personal Loan, Payday Loan, Student Loan, Auto Loan, Home Equity Loan, Student Loan, and Payday Loan' 'Not Specified, Student Loan, Student Loan, Credit-Builder Loan, and Auto Loan' 'Payday Loan, and Home Equity Loan' 'Credit-Builder Loan, Not Specified, Mortgage Loan, Payday Loan, Credit-Builder Loan, and Personal Loan' 'Mortgage Loan, Debt Consolidation Loan, Payday Loan, Auto Loan, and Not Specified' 'Credit-Builder Loan, Mortgage Loan, Mortgage Loan, Credit-Builder Loan, and Student Loan' 'Not Specified, Student Loan, and Student Loan' 'Payday Loan, Not Specified, Credit-Builder Loan, Debt Consolidation Loan, Payday Loan, Not Specified, Student Loan, and Student Loan'] Total Unique Values: 6261 ==================== Column: Delay_from_due_date Unique Values (Top 20): [ 3 -1 5 6 8 7 13 10 0 4 9 1 12 11 30 31 34 27 14 2] Total Unique Values: 73 ==================== Column: Num_of_Delayed_Payment Unique Values (Top 20): ['7' nan '4' '8_' '6' '1' '-1' '3_' '0' '8' '5' '3' '9' '12' '15' '17' '10' '2' '2_' '11'] Total Unique Values: 750 ==================== Column: Changed_Credit_Limit Unique Values (Top 20): ['11.27' '_' '6.27' '9.27' '5.42' '7.42' '6.42' '7.1' '11.1' '9.1' '1.99' '-2.01' '-1.01' '-3.01' '2.58' '10.14' '9.14' '9.34' '15.34' '8.34'] Total Unique Values: 4384 ==================== Column: Num_Credit_Inquiries Unique Values (Top 20): [4.000e+00 2.000e+00 3.000e+00 nan 5.000e+00 9.000e+00 8.000e+00 7.000e+00 6.000e+00 0.000e+00 1.000e+00 1.000e+01 1.050e+03 1.100e+01 1.200e+01 1.044e+03 1.700e+01 1.936e+03 1.300e+01 5.680e+02] Total Unique Values: 1224 ==================== Column: Credit_Mix Unique Values (Top 20): ['_' 'Good' 'Standard' 'Bad'] Total Unique Values: 4 ==================== Column: Outstanding_Debt Unique Values (Top 20): ['809.98' '605.03' '1303.01' '632.46' '943.86' '548.2' '352.16' '1704.18' '1377.74' '421.43' '1328.93' '1328.93_' '950.36' '179.22' '2602.69' '758.44' '818.22' '1296.64' '1283.37' '1283.37_'] Total Unique Values: 13178 ==================== Column: Credit_Utilization_Ratio Unique Values (Top 20): [26.82261962 31.94496006 28.60935202 31.37786187 24.79734691 27.26225871 22.53759303 23.9337948 24.46403064 38.55084843 33.22495079 39.18265566 34.97789475 33.3810102 31.13170161 32.93385629 28.61673482 41.70257342 26.51981539 39.50164811] Total Unique Values: 100000 ==================== Column: Credit_History_Age Unique Values (Top 20): ['22 Years and 1 Months' nan '22 Years and 3 Months' '22 Years and 4 Months' '22 Years and 5 Months' '22 Years and 6 Months' '22 Years and 7 Months' '26 Years and 7 Months' '26 Years and 8 Months' '26 Years and 9 Months' '26 Years and 10 Months' '26 Years and 11 Months' '27 Years and 0 Months' '27 Years and 1 Months' '27 Years and 2 Months' '17 Years and 9 Months' '17 Years and 10 Months' '17 Years and 11 Months' '18 Years and 1 Months' '18 Years and 2 Months'] Total Unique Values: 405 ==================== Column: Payment_of_Min_Amount Unique Values (Top 20): ['No' 'NM' 'Yes'] Total Unique Values: 3 ==================== Column: Total_EMI_per_month Unique Values (Top 20): [4.95749492e+01 1.88162146e+01 2.46992319e+02 1.64154517e+01 0.00000000e+00 1.50150000e+04 1.55150000e+04 1.37644605e+02 9.11220179e+02 2.38340000e+04 3.26620000e+04 7.04783327e+01 2.26892792e+02 4.66161291e+01 1.64150000e+04 6.50081743e+01 1.35173371e+02 8.03570000e+04 1.24392082e+02 3.65481972e+01] Total Unique Values: 14950 ==================== Column: Amount_invested_monthly Unique Values (Top 20): ['80.41529543900253' '118.28022162236736' '81.699521264648' '199.4580743910713' '41.420153086217326' '62.430172331195294' '178.3440674122349' '24.785216509052056' '104.291825168246' '40.39123782853101' '58.51597569589465' '99.30622796053305' '130.11542024292334' '43.477190144355745' '70.10177420755677' '218.90434353388733' '168.413702679309' '232.86038375993544' '__10000__' '825.2162699393922'] Total Unique Values: 91050 ==================== Column: Payment_Behaviour Unique Values (Top 20): ['High_spent_Small_value_payments' 'Low_spent_Large_value_payments' 'Low_spent_Medium_value_payments' 'Low_spent_Small_value_payments' 'High_spent_Medium_value_payments' '!@9#%8' 'High_spent_Large_value_payments'] Total Unique Values: 7 ==================== Column: Monthly_Balance Unique Values (Top 20): ['312.49408867943663' '284.62916249607184' '331.2098628537912' '223.45130972736786' '341.48923103222177' '340.4792117872438' '244.5653167062043' '358.12416760938714' '470.69062692529184' '484.5912142650067' '466.46647639764313' '465.6762241330048' '444.8670318506144' '481.505261949182' '464.8806778859809' '356.07810855965045' '1043.3159778669492' '998.8692967863226' '715.741367403555' '426.5134106068658'] Total Unique Values: 98793 ==================== Column: Credit_Score Unique Values (Top 20): ['Good' 'Standard' 'Poor'] Total Unique Values: 3 ====================
df_train.isnull().sum()
| 0 | |
|---|---|
| ID | 0 |
| Customer_ID | 0 |
| Month | 0 |
| Name | 9985 |
| Age | 0 |
| SSN | 0 |
| Occupation | 0 |
| Annual_Income | 0 |
| Monthly_Inhand_Salary | 15002 |
| Num_Bank_Accounts | 0 |
| Num_Credit_Card | 0 |
| Interest_Rate | 0 |
| Num_of_Loan | 0 |
| Type_of_Loan | 11408 |
| Delay_from_due_date | 0 |
| Num_of_Delayed_Payment | 7002 |
| Changed_Credit_Limit | 0 |
| Num_Credit_Inquiries | 1965 |
| Credit_Mix | 0 |
| Outstanding_Debt | 0 |
| Credit_Utilization_Ratio | 0 |
| Credit_History_Age | 9030 |
| Payment_of_Min_Amount | 0 |
| Total_EMI_per_month | 0 |
| Amount_invested_monthly | 4479 |
| Payment_Behaviour | 0 |
| Monthly_Balance | 1200 |
| Credit_Score | 0 |
df_test.isnull().sum()
| 0 | |
|---|---|
| ID | 0 |
| Customer_ID | 0 |
| Month | 0 |
| Name | 5015 |
| Age | 0 |
| SSN | 0 |
| Occupation | 0 |
| Annual_Income | 0 |
| Monthly_Inhand_Salary | 7498 |
| Num_Bank_Accounts | 0 |
| Num_Credit_Card | 0 |
| Interest_Rate | 0 |
| Num_of_Loan | 0 |
| Type_of_Loan | 5704 |
| Delay_from_due_date | 0 |
| Num_of_Delayed_Payment | 3498 |
| Changed_Credit_Limit | 0 |
| Num_Credit_Inquiries | 1035 |
| Credit_Mix | 0 |
| Outstanding_Debt | 0 |
| Credit_Utilization_Ratio | 0 |
| Credit_History_Age | 4470 |
| Payment_of_Min_Amount | 0 |
| Total_EMI_per_month | 0 |
| Amount_invested_monthly | 2271 |
| Payment_Behaviour | 0 |
| Monthly_Balance | 562 |
Exploratory Data Analysis¶
df_train['is_train'] = 1
df_test['is_train'] = 0
# Combining the datasets into one DataFrame
df_group = pd.concat([df_train, df_test], ignore_index=True)
Reading & Cleaning¶
df_group.describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | 150000 | 150000 | 0x1602 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_ID | 150000 | 12500 | CUS_0xd40 | 12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Month | 150000 | 12 | January | 12500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Name | 135000 | 10139 | Stevex | 66 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Age | 150000 | 2524 | 39 | 4198 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| SSN | 150000 | 12501 | #F%$D@*&8 | 8400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Occupation | 150000 | 16 | _______ | 10500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Annual_Income | 150000 | 21192 | 36585.12 | 24 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Monthly_Inhand_Salary | 127500.0 | NaN | NaN | NaN | 4190.115139 | 3180.489657 | 303.645417 | 1625.265833 | 3091.0 | 5948.454596 | 15204.633333 |
| Num_Bank_Accounts | 150000.0 | NaN | NaN | NaN | 17.00694 | 117.069476 | -1.0 | 3.0 | 6.0 | 7.0 | 1798.0 |
| Num_Credit_Card | 150000.0 | NaN | NaN | NaN | 22.623447 | 129.143006 | 0.0 | 4.0 | 5.0 | 7.0 | 1499.0 |
| Interest_Rate | 150000.0 | NaN | NaN | NaN | 71.234907 | 461.537193 | 1.0 | 8.0 | 13.0 | 20.0 | 5799.0 |
| Num_of_Loan | 150000 | 623 | 3 | 21500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Type_of_Loan | 132888 | 6260 | Not Specified | 2112 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Delay_from_due_date | 150000.0 | NaN | NaN | NaN | 21.0634 | 14.860154 | -5.0 | 10.0 | 18.0 | 28.0 | 67.0 |
| Num_of_Delayed_Payment | 139500 | 1058 | 19 | 7949 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Changed_Credit_Limit | 150000 | 4605 | _ | 3150 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Num_Credit_Inquiries | 147000.0 | NaN | NaN | NaN | 28.529014 | 194.456058 | 0.0 | 3.0 | 6.0 | 9.0 | 2597.0 |
| Credit_Mix | 150000 | 4 | Standard | 54858 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Outstanding_Debt | 150000 | 13622 | 1360.45 | 36 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Utilization_Ratio | 150000.0 | NaN | NaN | NaN | 32.283309 | 5.113315 | 20.0 | 28.054731 | 32.297058 | 36.487954 | 50.0 |
| Credit_History_Age | 136500 | 408 | 17 Years and 11 Months | 628 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Payment_of_Min_Amount | 150000 | 3 | Yes | 78484 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Total_EMI_per_month | 150000.0 | NaN | NaN | NaN | 1432.513579 | 8403.759977 | 0.0 | 30.947775 | 71.280006 | 166.279555 | 82398.0 |
| Amount_invested_monthly | 143250 | 136497 | __10000__ | 6480 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Payment_Behaviour | 150000 | 7 | Low_spent_Small_value_payments | 38207 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Monthly_Balance | 148238 | 148224 | __-333333333333333333333333333__ | 15 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Score | 100000 | 3 | Standard | 53174 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| is_train | 150000.0 | NaN | NaN | NaN | 0.666667 | 0.471406 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 |
df_group.isnull().any()
| 0 | |
|---|---|
| ID | False |
| Customer_ID | False |
| Month | False |
| Name | True |
| Age | False |
| SSN | False |
| Occupation | False |
| Annual_Income | False |
| Monthly_Inhand_Salary | True |
| Num_Bank_Accounts | False |
| Num_Credit_Card | False |
| Interest_Rate | False |
| Num_of_Loan | False |
| Type_of_Loan | True |
| Delay_from_due_date | False |
| Num_of_Delayed_Payment | True |
| Changed_Credit_Limit | False |
| Num_Credit_Inquiries | True |
| Credit_Mix | False |
| Outstanding_Debt | False |
| Credit_Utilization_Ratio | False |
| Credit_History_Age | True |
| Payment_of_Min_Amount | False |
| Total_EMI_per_month | False |
| Amount_invested_monthly | True |
| Payment_Behaviour | False |
| Monthly_Balance | True |
| Credit_Score | True |
| is_train | False |
credit_score_counts = df_group['Credit_Score'].value_counts()
colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFCC99', '#C299FF']
plt.figure(figsize=(10, 8))
plt.bar(credit_score_counts.index, credit_score_counts.values, color=colors[:len(credit_score_counts)], edgecolor='black')
plt.title('Distribution of Target Variable: Credit_Score', fontsize=14)
plt.xlabel('Credit_Score', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
The chart shows the distribution of credit scores, with most individuals falling into the Standard category. The Poor category follows, while the Good category has the fewest individuals.
colors = ['#FF5733', '#33FFBD', '#337BFF', '#FF33A1', '#A133FF']
plt.figure(figsize=(8, 6))
plt.pie(credit_score_counts.values, labels=credit_score_counts.index, colors=colors, autopct='%1.1f%%', startangle=90, pctdistance=0.85)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
plt.gca().add_artist(centre_circle)
plt.title('Distribution of Target Variable: Credit_Score', fontsize=14)
plt.show()
The pie chart shows that 53.2% of individuals fall into the Standard category, 29.0% into Poor, and 17.8% into Good.
Business Implications:¶
- Focus on improving the creditworthiness of the
Poorsegment with tailored strategies. - Target the
Goodsegment with premium products or higher credit limits to maximize profits. - Maintain the
Standardgroup with loyalty programs and monitor their credit behaviors.
df_group['ID'].duplicated().sum()
0
df_group['Name'] = df_group.groupby('Customer_ID')['Name'].transform(lambda x: x.fillna(x.mode()[0]))
df_group['Age_Cleaned'] = pd.to_numeric(df_group['Age'].str.extract(r'(\d+)')[0], errors='coerce')
df_group['Age_Cleaned'] = df_group['Age_Cleaned'].fillna(df_group['Age_Cleaned'].median())
df_group['Age_Cleaned'] = df_group['Age_Cleaned'].astype(int)
print(df_group[['Age', 'Age_Cleaned']].head())
Age Age_Cleaned 0 23 23 1 23 23 2 -500 500 3 23 23 4 23 23
import seaborn as sns
import matplotlib.pyplot as plt
# Density Plot
plt.figure(figsize=(10, 5))
sns.kdeplot(df_group['Age_Cleaned'], shade=True, color='blue')
plt.title('Density Plot of Variable: Age', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.grid(True)
plt.show()
# Violin Plot
plt.figure(figsize=(5, 5))
sns.violinplot(x=df_group['Age_Cleaned'], palette='muted')
plt.title('Violin Plot Distribution of Variable: Age', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.grid(True)
plt.show()
<ipython-input-18-3038c387d625>:6: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df_group['Age_Cleaned'], shade=True, color='blue')
<ipython-input-18-3038c387d625>:15: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(x=df_group['Age_Cleaned'], palette='muted')
import re
def is_valid_ssn(ssn):
pattern = r'^\d{3}-\d{2}-\d{4}$'
return bool(re.match(pattern, ssn))
df_group['Is_Valid_SSN'] = df_group['SSN'].apply(is_valid_ssn)
valid_count = df_group['Is_Valid_SSN'].value_counts()
labels = ['Valid', 'Invalid']
colors = ['#66CC66', '#FF6666']
plt.figure(figsize=(6, 6))
plt.pie(valid_count, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, wedgeprops={'edgecolor': 'black'})
plt.title(' SSN Valid vs Invalid')
plt.show()
The chart shows 94.4% valid SSNs and 5.6% invalid SSNs.
Business Implications:¶
- Automate invalid SSN detection to improve efficiency and data integrity.
- Investigate invalid SSNs for potential fraud or incomplete records.
- Educate customers on submitting accurate SSNs to reduce errors.
- Ensure valid SSNs to meet regulatory and compliance requirements.
SSN
df_group[df_group['Is_Valid_SSN'] == 0]['SSN']
| SSN | |
|---|---|
| 7 | #F%$D@*&8 |
| 29 | #F%$D@*&8 |
| 51 | #F%$D@*&8 |
| 54 | #F%$D@*&8 |
| 98 | #F%$D@*&8 |
| ... | ... |
| 149937 | #F%$D@*&8 |
| 149950 | #F%$D@*&8 |
| 149955 | #F%$D@*&8 |
| 149968 | #F%$D@*&8 |
| 149973 | #F%$D@*&8 |
8400 rows × 1 columns
df_group['SSN_Cleaned'] = df_group['SSN'].apply(lambda x: x if isinstance(x, str) and len(x.split('-')) == 3 and all(part.isdigit() for part in x.split('-')) and len(x.split('-')[0]) == 3 and len(x.split('-')[1]) == 2 and len(x.split('-')[2]) == 4 else np.nan)
df_group['SSN_Cleaned'] = df_group.groupby('Customer_ID')['SSN_Cleaned'].transform(lambda x: x.ffill().bfill())
df_group['Is_Valid_SSN'] = df_group['SSN_Cleaned'].notnull()
df_group['Is_Valid_SSN'].value_counts()
| count | |
|---|---|
| Is_Valid_SSN | |
| True | 150000 |
plt.figure(figsize=(15, 8))
df_group['Occupation'].value_counts().plot(kind='barh', color='skyblue', edgecolor='black')
plt.title('Distribution of Variable: Occupation', fontsize=16)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Occupation', fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
df_group["Occupation"].value_counts()
| count | |
|---|---|
| Occupation | |
| _______ | 10500 |
| Lawyer | 9899 |
| Engineer | 9562 |
| Architect | 9550 |
| Mechanic | 9459 |
| Accountant | 9404 |
| Scientist | 9403 |
| Developer | 9381 |
| Media_Manager | 9362 |
| Teacher | 9318 |
| Entrepreneur | 9277 |
| Journalist | 9122 |
| Doctor | 9114 |
| Manager | 8973 |
| Musician | 8858 |
| Writer | 8818 |
df_group['Occupation'] = df_group['Occupation'].replace('_______', np.nan)
df_group['Occupation'] = df_group.groupby('Customer_ID')['Occupation'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
df_group["Occupation"].value_counts()
<ipython-input-24-9ed0b60522c0>:2: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
df_group['Occupation'] = df_group.groupby('Customer_ID')['Occupation'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
| count | |
|---|---|
| Occupation | |
| Lawyer | 10644 |
| Engineer | 10296 |
| Architect | 10236 |
| Mechanic | 10164 |
| Scientist | 10116 |
| Accountant | 10116 |
| Developer | 10080 |
| Media_Manager | 10080 |
| Teacher | 10008 |
| Entrepreneur | 9972 |
| Doctor | 9852 |
| Journalist | 9804 |
| Manager | 9648 |
| Musician | 9528 |
| Writer | 9456 |
occupation_counts = df_group['Occupation'].value_counts()
plt.figure(figsize=(10, 6))
occupation_counts.plot(kind='barh', color='steelblue', edgecolor='black')
plt.title('Distribution of Variable: Occupation')
plt.xlabel('Count')
plt.ylabel('Occupation')
plt.show()
df_group['Monthly_Inhand_Salary'] = df_group.groupby('Customer_ID')['Monthly_Inhand_Salary'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
df_group['Monthly_Inhand_Salary'].isnull().sum()
<ipython-input-26-1f58bad5e2a4>:1: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
df_group['Monthly_Inhand_Salary'] = df_group.groupby('Customer_ID')['Monthly_Inhand_Salary'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
0
# Histogram
sns.histplot(df_group['Monthly_Inhand_Salary'], kde=True, bins=30, color='purple')
plt.title('Distribution of Variable: Monthly_Inhand_Salary')
plt.xlabel('Monthly_Inhand_Salary')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Monthly_Inhand_Salary'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Monthly_Inhand_Salary')
plt.xlabel('Monthly_Inhand_Salary')
plt.grid(True)
plt.show()
df_group['Annual_Income_Cleaned'] = df_group['Annual_Income'].str.replace('_', '', regex=False).astype(float)
sns.histplot(df_group['Annual_Income_Cleaned'], kde=True, bins=30, color='purple')
plt.title('Distribution of Variable: Annual_Income')
plt.xlabel('Annual_Income')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Annual_Income_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Annual_Income')
plt.xlabel('Annual_Income')
plt.grid(True)
plt.show()
income_ratio = df_group['Annual_Income_Cleaned'] / df_group['Monthly_Inhand_Salary']
print(income_ratio.describe())
count 150000.000000 mean 65.262083 std 801.323485 min 8.089821 25% 11.561407 50% 12.035127 75% 12.573912 max 54110.522117 dtype: float64
The average income ratio indicates that most individuals have annual incomes around 12-13 times their monthly salary, aligning with standard income patterns. Outliers, like extremely high ratios, may suggest errors, bonuses, or irregular income sources. Businesses should focus on accurate income verification and tailor financial products based on realistic income ratios.
Q1 = income_ratio.quantile(0.25)
Q3 = income_ratio.quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
upper_bound
14.092668813782176
We define a reasonable upper limit for outliers as a scenario where the individual might receive bonuses or other additional income equal to 14 times the monthly salary. Any data point exceeding this threshold can be considered an outlier, as it deviates from the expected annual income range, even when accounting for bonuses and other irregular earnings.
ann_module = df_group.groupby("Customer_ID")["Annual_Income_Cleaned"].agg(
lambda x: x.mode().iloc[0])
df_group["Annual_Income_Cleaned"] = df_group.apply(
lambda row: ann_module[row["Customer_ID"]]
if row["Annual_Income_Cleaned"] > row["Monthly_Inhand_Salary"] * 14
else row["Annual_Income_Cleaned"],
axis=1
)
df_group['Num_of_Loan'].value_counts()
| count | |
|---|---|
| Num_of_Loan | |
| 3 | 21500 |
| 2 | 21423 |
| 4 | 20998 |
| 0 | 15543 |
| 1 | 15112 |
| ... | ... |
| 291 | 1 |
| 365 | 1 |
| 1014 | 1 |
| 1129_ | 1 |
| 1296_ | 1 |
623 rows × 1 columns
df_group['Num_of_Loan_Cleaned'] = df_group['Num_of_Loan'].str.replace(r'\D', '', regex=True).astype(int)
# Density Plot
plt.figure(figsize=(10, 6))
sns.kdeplot(df_group['Num_of_Loan_Cleaned'], fill=True, color='green')
plt.title('Density Plot of Variable: Num_of_Loan', fontsize=14)
plt.xlabel('Num_of_Loan', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.grid(True)
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(x=df_group['Num_of_Loan_Cleaned'], color='purple')
plt.title('Violin Plot of Variable: Num_of_Loan', fontsize=14)
plt.xlabel('Num_of_Loan', fontsize=12)
plt.grid(True)
plt.show()
df_group[df_group['Num_of_Loan_Cleaned'] < 100]['Num_of_Loan_Cleaned'].value_counts().head(20).sort_index()
| count | |
|---|---|
| Num_of_Loan_Cleaned | |
| 0 | 16376 |
| 1 | 15901 |
| 2 | 22547 |
| 3 | 22618 |
| 4 | 22111 |
| 5 | 10814 |
| 6 | 11705 |
| 7 | 11024 |
| 8 | 4785 |
| 9 | 5539 |
| 23 | 2 |
| 31 | 2 |
| 33 | 2 |
| 42 | 1 |
| 49 | 2 |
| 50 | 3 |
| 53 | 1 |
| 55 | 2 |
| 58 | 3 |
| 95 | 3 |
The highest value is 9, and any value above this can be considered an anomaly. This threshold helps to identify data points that deviate significantly from the expected range, ensuring a more accurate analysis by flagging outliers for further investigation.
df_group.loc[df_group["Num_of_Loan_Cleaned"] > 9, "Num_of_Loan_Cleaned"] = np.nan
df_group["Num_of_Loan_Cleaned"] = df_group.groupby("Customer_ID")["Num_of_Loan_Cleaned"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-39-882e7865d870>:4: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
sns.histplot(df_group['Num_of_Loan_Cleaned'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_of_Loan_Cleaned')
plt.xlabel('Num_of_Loan_Cleaned')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_of_Loan_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_of_Loan_Cleaned')
plt.xlabel('Num_of_Loan_Cleaned')
plt.grid(True)
plt.show()
sns.histplot(df_group['Num_Bank_Accounts'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Bank_Accounts'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.grid(True)
plt.show()
df_group[df_group['Num_Bank_Accounts'] < 100]['Num_Bank_Accounts'].value_counts().head(20).sort_index()
| count | |
|---|---|
| Num_Bank_Accounts | |
| -1 | 37 |
| 0 | 6494 |
| 1 | 6743 |
| 2 | 6456 |
| 3 | 17905 |
| 4 | 18286 |
| 5 | 18186 |
| 6 | 19505 |
| 7 | 19231 |
| 8 | 19152 |
| 9 | 8181 |
| 10 | 7846 |
| 11 | 28 |
| 28 | 3 |
| 34 | 4 |
| 39 | 3 |
| 43 | 4 |
| 70 | 4 |
| 74 | 5 |
| 79 | 3 |
The highest value is 10, and any value above this can be considered an anomaly. Similarly, any value below 0 is also anomalous, as it falls outside the valid range. These thresholds help to identify outliers both above and below the expected range, ensuring a cleaner and more accurate dataset for analysis.
df_group.loc[(df_group["Num_Bank_Accounts"] < 0) | (df_group["Num_Bank_Accounts"] > 10),
"Num_Bank_Accounts"] = np.nan
df_group["Num_Bank_Accounts"] = df_group.groupby("Customer_ID")["Num_Bank_Accounts"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-43-eef4c02d9120>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
sns.histplot(df_group['Num_Bank_Accounts'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Bank_Accounts'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.grid(True)
plt.show()
sns.histplot(df_group['Num_Credit_Card'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Card'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.grid(True)
plt.show()
df_group[df_group['Num_Credit_Card'] < 100]['Num_Credit_Card'].value_counts().head(20).sort_index()
| count | |
|---|---|
| Num_Credit_Card | |
| 0 | 29 |
| 1 | 3195 |
| 2 | 3280 |
| 3 | 19816 |
| 4 | 21102 |
| 5 | 27669 |
| 6 | 24802 |
| 7 | 24886 |
| 8 | 7453 |
| 9 | 6976 |
| 10 | 7265 |
| 11 | 77 |
| 25 | 5 |
| 28 | 5 |
| 41 | 5 |
| 51 | 8 |
| 66 | 5 |
| 71 | 5 |
| 77 | 5 |
| 92 | 6 |
df_group.loc[(df_group["Num_Credit_Card"] < 1) | (df_group["Num_Credit_Card"] > 10),
"Num_Credit_Card"] = np.nan
df_group["Num_Credit_Card"] = df_group.groupby("Customer_ID")["Num_Credit_Card"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-47-5754288aab31>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
sns.histplot(df_group['Num_Credit_Card'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Card'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.grid(True)
plt.show()
sns.histplot(df_group['Interest_Rate'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Interest_Rate'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.grid(True)
plt.show()
df_group[df_group['Interest_Rate'] < 20]['Interest_Rate'].value_counts().head(20).sort_index()
| count | |
|---|---|
| Interest_Rate | |
| 1 | 4027 |
| 2 | 3710 |
| 3 | 4153 |
| 4 | 3876 |
| 5 | 7479 |
| 6 | 7089 |
| 7 | 6744 |
| 8 | 7515 |
| 9 | 6747 |
| 10 | 6799 |
| 11 | 6626 |
| 12 | 6828 |
| 13 | 3571 |
| 14 | 3351 |
| 15 | 5984 |
| 16 | 5597 |
| 17 | 5719 |
| 18 | 6154 |
| 19 | 5440 |
filtered_interest_rate = df_group[(df_group['Interest_Rate'] > 20) & (df_group['Interest_Rate'] < 100)]
interest_rate_counts = filtered_interest_rate['Interest_Rate'].value_counts().head(20).sort_index()
print(interest_rate_counts)
Interest_Rate 21 2335 22 2580 23 2530 24 2533 25 2356 26 2238 27 2416 28 2431 29 2495 30 2536 31 2188 32 2616 33 2201 34 2246 59 2 75 2 76 2 81 1 89 2 95 2 Name: count, dtype: int64
The highest value is 34, and any value above this can be considered an anomaly. This threshold identifies outliers that exceed the expected range, ensuring that the data remains consistent and reliable for analysis.
df_group.loc[df_group["Interest_Rate"] > 34,
"Interest_Rate"] = np.nan
df_group["Interest_Rate"] = df_group.groupby("Customer_ID")["Interest_Rate"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-52-fb42c8ef6ab2>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
sns.histplot(df_group['Interest_Rate'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Interest_Rate'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.grid(True)
plt.show()
df_group['Type_of_Loan'].nunique()
print('\n', df_group['Type_of_Loan'].value_counts())
Type_of_Loan
Not Specified 2112
Credit-Builder Loan 1920
Personal Loan 1908
Debt Consolidation Loan 1896
Student Loan 1860
...
Not Specified, Mortgage Loan, Auto Loan, and Payday Loan 12
Payday Loan, Mortgage Loan, Debt Consolidation Loan, and Student Loan 12
Debt Consolidation Loan, Auto Loan, Personal Loan, Debt Consolidation Loan, Student Loan, and Credit-Builder Loan 12
Student Loan, Auto Loan, Student Loan, Credit-Builder Loan, Home Equity Loan, Debt Consolidation Loan, and Debt Consolidation Loan 12
Personal Loan, Auto Loan, Mortgage Loan, Student Loan, and Student Loan 12
Name: count, Length: 6260, dtype: int64
df_group[df_group['Type_of_Loan'].isnull() == True][['Customer_ID', 'Type_of_Loan', 'Num_of_Loan_Cleaned']].head(10)
| Customer_ID | Type_of_Loan | Num_of_Loan_Cleaned | |
|---|---|---|---|
| 32 | CUS_0x1cdb | NaN | 0.0 |
| 33 | CUS_0x1cdb | NaN | 0.0 |
| 34 | CUS_0x1cdb | NaN | 0.0 |
| 35 | CUS_0x1cdb | NaN | 0.0 |
| 36 | CUS_0x1cdb | NaN | 0.0 |
| 37 | CUS_0x1cdb | NaN | 0.0 |
| 38 | CUS_0x1cdb | NaN | 0.0 |
| 39 | CUS_0x1cdb | NaN | 0.0 |
| 40 | CUS_0x95ee | NaN | 0.0 |
| 41 | CUS_0x95ee | NaN | 0.0 |
df_group['Type_of_Loan'].fillna('No Loan', inplace=True)
df_group["Type_of_Loan_Cleaned"] = df_group["Type_of_Loan"].str.replace(" and", "", regex=False)
df_group['Type_of_Loan_Cleaned'] = df_group['Type_of_Loan_Cleaned'].str.split(', ')
unique_loans = set()
for loans in df_group['Type_of_Loan_Cleaned']:
for loan in loans:
loan_items = [item.strip() for item in loan.split(",")]
unique_loans.update(loan_items)
unique_loans = sorted(unique_loans)
unique_loans
<ipython-input-56-12ef8776b8f6>:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df_group['Type_of_Loan'].fillna('No Loan', inplace=True)
['Auto Loan', 'Credit-Builder Loan', 'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan', 'No Loan', 'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan']
df_group['Type_of_Loan_Cleaned'] = df_group['Type_of_Loan_Cleaned'].apply(
lambda x: ', '.join(sorted(set(x)))
)
df_group['Type_of_Loan_Cleaned'].value_counts()
| count | |
|---|---|
| Type_of_Loan_Cleaned | |
| No Loan | 17112 |
| Not Specified | 2340 |
| Credit-Builder Loan | 2232 |
| Personal Loan | 2232 |
| Student Loan | 2208 |
| ... | ... |
| Auto Loan, Credit-Builder Loan, Home Equity Loan, Mortgage Loan, Not Specified, Payday Loan, Personal Loan, Student Loan | 24 |
| Auto Loan, Credit-Builder Loan, Debt Consolidation Loan, Home Equity Loan, Mortgage Loan, Not Specified, Payday Loan, Student Loan | 24 |
| Auto Loan, Debt Consolidation Loan, Home Equity Loan, Mortgage Loan, Payday Loan, Student Loan | 24 |
| Credit-Builder Loan, Debt Consolidation Loan, Home Equity Loan, Mortgage Loan, Not Specified, Payday Loan, Personal Loan, Student Loan | 12 |
| Auto Loan, Credit-Builder Loan, Debt Consolidation Loan, Mortgage Loan, Not Specified, Payday Loan, Personal Loan | 12 |
508 rows × 1 columns
loan_counts = []
for loans in unique_loans:
count_items = len(df_group[df_group['Type_of_Loan_Cleaned'].str.contains(loans, na=False)])
loan_counts.append({'Type': loans, 'Count': count_items})
loan_counts
[{'Type': 'Auto Loan', 'Count': 45840},
{'Type': 'Credit-Builder Loan', 'Count': 47592},
{'Type': 'Debt Consolidation Loan', 'Count': 46560},
{'Type': 'Home Equity Loan', 'Count': 47100},
{'Type': 'Mortgage Loan', 'Count': 47040},
{'Type': 'No Loan', 'Count': 17112},
{'Type': 'Not Specified', 'Count': 47520},
{'Type': 'Payday Loan', 'Count': 47916},
{'Type': 'Personal Loan', 'Count': 46656},
{'Type': 'Student Loan', 'Count': 46560}]
loans = pd.DataFrame(loan_counts)
plt.figure(figsize=(12, 6))
plt.bar(loans['Type'], loans['Count'], color='skyblue', edgecolor='black')
plt.title('Frequency of Loan Types', fontsize=16)
plt.xlabel('Loan Type', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
df_group['Delay_from_due_date'].value_counts()
| count | |
|---|---|
| Delay_from_due_date | |
| 15 | 5355 |
| 13 | 5185 |
| 8 | 5004 |
| 14 | 4949 |
| 10 | 4926 |
| ... | ... |
| 63 | 90 |
| 65 | 86 |
| -5 | 51 |
| 66 | 44 |
| 67 | 29 |
73 rows × 1 columns
negative_delays = df_group[df_group['Delay_from_due_date'] < 0]
print(negative_delays['Delay_from_due_date'].describe())
count 889.000000 mean -2.271091 std 1.221830 min -5.000000 25% -3.000000 50% -2.000000 75% -1.000000 max -1.000000 Name: Delay_from_due_date, dtype: float64
Negative Delay from Due Date indicates early payment, which doesn't need adjustment as the distribution is normal, except for extreme cases like -30 or more. We'll verify data accuracy by checking differences exceeding 30 days between consecutive delay_from_due_date entries for each Customer_ID.
def check_delays_each_costumer(group):
return (group['Delay_from_due_date'].diff().abs() > 30).any()
check = df_group.groupby("Customer_ID").apply(check_delays_each_costumer) == True
check.sum()
<ipython-input-62-21e4f303ec75>:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
check = df_group.groupby("Customer_ID").apply(check_delays_each_costumer) == True
0
# Line Plot
plt.figure(figsize=(10, 6))
delay_counts = df_group['Delay_from_due_date'].value_counts().sort_index()
plt.plot(delay_counts.index, delay_counts.values, marker='o', color='green')
plt.title('Line Plot of Variable: Delay_from_due_date', fontsize=14)
plt.xlabel('Delay_from_due_date', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True)
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(x=df_group['Delay_from_due_date'], color='purple')
plt.title('Violin Plot of Variable: Delay_from_due_date', fontsize=14)
plt.xlabel('Delay_from_due_date', fontsize=12)
plt.grid(True)
plt.show()
df_group['Num_of_Delayed_Payment_Cleaned'] = df_group['Num_of_Delayed_Payment'].str.replace('_', '', regex=False).astype(float)
df_group[df_group['Num_of_Delayed_Payment_Cleaned'] < 10]['Num_of_Delayed_Payment_Cleaned'].value_counts().head()
| count | |
|---|---|
| Num_of_Delayed_Payment_Cleaned | |
| 9.0 | 7421 |
| 8.0 | 7303 |
| 7.0 | 3571 |
| 6.0 | 3435 |
| 5.0 | 3154 |
df_group[df_group['Num_of_Delayed_Payment_Cleaned'] > 25]['Num_of_Delayed_Payment_Cleaned'].value_counts().head(15)
| count | |
|---|---|
| Num_of_Delayed_Payment_Cleaned | |
| 26.0 | 472 |
| 27.0 | 354 |
| 28.0 | 196 |
| 3484.0 | 4 |
| 538.0 | 3 |
| 265.0 | 3 |
| 1150.0 | 3 |
| 1014.0 | 3 |
| 2801.0 | 3 |
| 4211.0 | 3 |
| 1946.0 | 3 |
| 2606.0 | 3 |
| 975.0 | 2 |
| 549.0 | 2 |
| 762.0 | 2 |
The highest value is 28, and any value above this can be considered an anomaly. Additionally, values below 0 are also anomalous, as they fall outside the expected range. These thresholds help in identifying outliers for further investigation or cleaning.
df_group.loc[(df_group["Num_of_Delayed_Payment_Cleaned"] < 0) | (df_group["Num_of_Delayed_Payment_Cleaned"] > 28),
"Num_of_Delayed_Payment_Cleaned"] = np.nan
df_group["Num_of_Delayed_Payment_Cleaned"] = df_group.groupby("Customer_ID")["Num_of_Delayed_Payment_Cleaned"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
# Histogram
plt.hist(df_group["Num_of_Delayed_Payment_Cleaned"], bins=10, edgecolor='black')
plt.title('Distribution of Variable: Num_of_Delayed_Payment_Cleaned')
plt.xlabel('Num_of_Delayed_Payment_Cleaned')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group["Num_of_Delayed_Payment_Cleaned"], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_of_Delayed_Payment_Cleaned')
plt.xlabel('Num_of_Delayed_Payment_Cleaned')
plt.grid(True)
plt.show()
<ipython-input-66-a19df7a99c39>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
df_group['Changed_Credit_Limit'].value_counts().head(50)
| count | |
|---|---|
| Changed_Credit_Limit | |
| _ | 3150 |
| 11.5 | 197 |
| 11.32 | 189 |
| 8.22 | 189 |
| 7.35 | 181 |
| 10.06 | 178 |
| 8.23 | 169 |
| 7.69 | 166 |
| 7.01 | 165 |
| 11.49 | 164 |
| 7.33 | 163 |
| 9.25 | 162 |
| 3.93 | 161 |
| 1.63 | 159 |
| 8.99 | 156 |
| 8.3 | 156 |
| 7.63 | 155 |
| 8.54 | 153 |
| 8.82 | 152 |
| 7.23 | 152 |
| 9.58 | 151 |
| 11.73 | 151 |
| 10.3 | 151 |
| 8.76 | 149 |
| 9.13 | 149 |
| 11.63 | 148 |
| 11.78 | 148 |
| 11.95 | 148 |
| 9.88 | 146 |
| 8.04 | 146 |
| 9.2 | 145 |
| 7.06 | 144 |
| 7.66 | 143 |
| 10.54 | 143 |
| 8.34 | 141 |
| 7.64 | 140 |
| 4.92 | 140 |
| 4.86 | 140 |
| 9.09 | 139 |
| 8.74 | 138 |
| 9.18 | 138 |
| 1.59 | 138 |
| 5.99 | 137 |
| 9.97 | 137 |
| 7.91 | 137 |
| 0.57 | 136 |
| 8.56 | 136 |
| 11.51 | 136 |
| 8.67 | 136 |
| 10.64 | 135 |
df_group.loc[(df_group["Changed_Credit_Limit"] == "_"), "Changed_Credit_Limit"] = np.nan
df_group['Changed_Credit_Limit_Cleaned'] = pd.to_numeric(
df_group['Changed_Credit_Limit'].str.replace('_', '', regex=False),
errors='coerce'
)
df_group["Changed_Credit_Limit_Cleaned"] = df_group.groupby("Customer_ID")["Changed_Credit_Limit_Cleaned"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
df_group['Changed_Credit_Limit_Cleaned'] = df_group['Changed_Credit_Limit_Cleaned'].round(3)
df_group['Changed_Credit_Limit_Cleaned'].value_counts()
<ipython-input-69-c3734b7ef243>:9: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
| count | |
|---|---|
| Changed_Credit_Limit_Cleaned | |
| 11.50 | 198 |
| 8.22 | 196 |
| 11.32 | 190 |
| 7.35 | 186 |
| 7.69 | 181 |
| ... | ... |
| 32.11 | 1 |
| 24.27 | 1 |
| 31.15 | 1 |
| 29.20 | 1 |
| 29.17 | 1 |
3770 rows × 1 columns
# Histogram
sns.histplot(df_group['Changed_Credit_Limit_Cleaned'], kde=True, bins=30, color='lime')
plt.title('Distribution Distribution of Variable: Changed_Credit_Limit')
plt.xlabel('Changed_Credit_Limit')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Changed_Credit_Limit_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Changed_Credit_Limit')
plt.xlabel('Changed_Credit_Limit')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Inquiries'].dropna(), vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Inquiries')
plt.xlabel('Num_Credit_Inquiries')
plt.grid(True)
plt.show()
df_group[df_group['Num_Credit_Inquiries'] > 10]['Num_Credit_Inquiries'].value_counts().head(10)
| count | |
|---|---|
| Num_Credit_Inquiries | |
| 11.0 | 8047 |
| 12.0 | 7156 |
| 13.0 | 3545 |
| 14.0 | 2433 |
| 15.0 | 1871 |
| 16.0 | 1107 |
| 17.0 | 672 |
| 769.0 | 5 |
| 1460.0 | 5 |
| 1114.0 | 5 |
The highest value is 17, and any value above this can be considered an anomaly. This threshold helps identify outliers that deviate significantly from the expected range.
df_group.loc[(df_group["Num_Credit_Inquiries"] > 17), "Num_Credit_Inquiries"] = np.nan
df_group["Num_Credit_Inquiries"] = df_group.groupby("Customer_ID")["Num_Credit_Inquiries"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
sns.histplot(df_group['Num_Credit_Inquiries'], kde=True, bins=30, color='lime')
plt.title('Distribution Distribution of Variable: Num_Credit_Inquiries')
plt.xlabel('Num_Credit_Inquiries')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Inquiries'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Inquiries')
plt.xlabel('Num_Credit_Inquiries')
plt.grid(True)
plt.show()
<ipython-input-73-9d75bb1fb014>:4: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Mix')
plt.xlabel('Credit_Mix')
plt.ylabel('Count')
plt.show()
<ipython-input-74-613dbba4248f>:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
df_group.loc[(df_group["Credit_Mix"] == '_'), "Credit_Mix"] = np.nan
df_group["Credit_Mix"] = df_group.groupby("Customer_ID")["Credit_Mix"].transform(
lambda x: x.fillna(x.mode()[0]))
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Mix')
plt.xlabel('Credit_Mix')
plt.ylabel('Count')
plt.show()
<ipython-input-75-9698a37e768e>:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
df_group['Outstanding_Debt_Cleaned'] = pd.to_numeric(
df_group['Outstanding_Debt'].str.replace('_', '', regex=False),
errors='coerce'
)
# Histogram
sns.histplot(df_group['Outstanding_Debt_Cleaned'], kde=True, bins=30, color='yellow')
plt.title('Distribution of Variable: Outstanding_Debt')
plt.xlabel('Outstanding_Debt')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Outstanding_Debt_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Outstanding_Debt')
plt.xlabel('Outstanding_Debt')
plt.grid(True)
plt.show()
# Histogram
sns.histplot(df_group['Credit_Utilization_Ratio'], kde=True, bins=30, color='blue')
plt.title('Distribution Credit_Utilization_Ratio in Months')
plt.xlabel('Credit_Utilization_Ratio')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Credit_Utilization_Ratio'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Credit_Utilization_Ratio')
plt.xlabel('Credit_Utilization_Ratio')
plt.grid(True)
plt.show()
def convert_to_months(value):
if pd.isnull(value):
return np.nan
match = re.match(r'(\d+) Years and (\d+) Months', value)
if match:
years, months = int(match.group(1)), int(match.group(2))
return years * 12 + months
return np.nan
df_group['Credit_History_Age_in_Months'] = df_group['Credit_History_Age'].apply(convert_to_months)
df_group["Credit_History_Age_in_Months"].value_counts()
| count | |
|---|---|
| Credit_History_Age_in_Months | |
| 215.0 | 628 |
| 220.0 | 621 |
| 219.0 | 617 |
| 237.0 | 615 |
| 218.0 | 615 |
| ... | ... |
| 3.0 | 20 |
| 2.0 | 15 |
| 407.0 | 15 |
| 408.0 | 14 |
| 1.0 | 2 |
408 rows × 1 columns
df_group.groupby("Customer_ID")["Credit_History_Age_in_Months"].value_counts(
dropna=False).head(20)
| count | ||
|---|---|---|
| Customer_ID | Credit_History_Age_in_Months | |
| CUS_0x1000 | 122.0 | 1 |
| 123.0 | 1 | |
| 124.0 | 1 | |
| 125.0 | 1 | |
| 126.0 | 1 | |
| 127.0 | 1 | |
| 128.0 | 1 | |
| 129.0 | 1 | |
| 130.0 | 1 | |
| 131.0 | 1 | |
| 132.0 | 1 | |
| 133.0 | 1 | |
| CUS_0x1009 | 365.0 | 1 |
| 366.0 | 1 | |
| 367.0 | 1 | |
| 369.0 | 1 | |
| 370.0 | 1 | |
| 371.0 | 1 | |
| 372.0 | 1 | |
| 373.0 | 1 |
def fill_nan_credit(df):
# Forward fill: Isi NaN dengan nilai sebelumnya + 1
df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='ffill') + 1
# Backward fill: Isi NaN yang tersisa dengan nilai berikutnya - 1
df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='bfill') - 1
return df
df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fill_nan_credit)
df_group['Credit_History_Age_in_Months'].isnull().sum()
<ipython-input-81-472e60c0e87f>:3: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='ffill') + 1
<ipython-input-81-472e60c0e87f>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='bfill') - 1
<ipython-input-81-472e60c0e87f>:8: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fill_nan_credit)
0
plt.figure(figsize=(10, 5))
sns.histplot(df_group['Credit_History_Age_in_Months'], kde=True, bins=30, color='red')
plt.title('Distribution Credit History Age in Months')
plt.xlabel('Total Months')
plt.ylabel('Count')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Credit_History_Age_in_Months'], vert=False, patch_artist=True)
plt.title('Boxplot Credit History Age In Months')
plt.xlabel('Total Months')
plt.grid(True)
plt.show()
plt.figure(figsize=(8, 6))
sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Payment_of_Min_Amount')
plt.xlabel('Payment_of_Min_Amount')
plt.ylabel('Count')
plt.show()
<ipython-input-83-2601dd4ddbec>:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
df_group.groupby("Customer_ID")["Payment_of_Min_Amount"].value_counts(
dropna=False).head(20)
| count | ||
|---|---|---|
| Customer_ID | Payment_of_Min_Amount | |
| CUS_0x1000 | Yes | 12 |
| CUS_0x1009 | Yes | 11 |
| NM | 1 | |
| CUS_0x100b | No | 11 |
| NM | 1 | |
| CUS_0x1011 | Yes | 11 |
| NM | 1 | |
| CUS_0x1013 | No | 11 |
| NM | 1 | |
| CUS_0x1015 | Yes | 10 |
| NM | 2 | |
| CUS_0x1018 | Yes | 10 |
| NM | 2 | |
| CUS_0x1026 | No | 11 |
| NM | 1 | |
| CUS_0x102d | No | 10 |
| NM | 2 | |
| CUS_0x102e | Yes | 11 |
| NM | 1 | |
| CUS_0x1032 | Yes | 11 |
def replace_nm_with_majority(group):
majority_value = group['Payment_of_Min_Amount'][group['Payment_of_Min_Amount'] != 'NM'].mode()[0]
group['Payment_of_Min_Amount'] = group['Payment_of_Min_Amount'].replace('NM', majority_value)
return group
df_group = df_group.groupby('Customer_ID', group_keys=False).apply(replace_nm_with_majority)
df_group["Payment_of_Min_Amount"].isnull().sum()
<ipython-input-85-0baa9dd83691>:6: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
df_group = df_group.groupby('Customer_ID', group_keys=False).apply(replace_nm_with_majority)
0
plt.figure(figsize=(8, 6))
sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Payment_of_Min_Amount')
plt.xlabel('Payment_of_Min_Amount')
plt.ylabel('Count')
plt.show()
<ipython-input-86-2601dd4ddbec>:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Total_EMI_per_month'], vert=False, patch_artist=True)
plt.title('Boxplot Total_EMI_per_month')
plt.xlabel('Total_EMI_per_month')
plt.grid(True)
plt.show()
df_group['Total_EMI_per_month'].value_counts()
| count | |
|---|---|
| Total_EMI_per_month | |
| 0.000000 | 15615 |
| 49.574949 | 12 |
| 16.528703 | 12 |
| 64.443403 | 12 |
| 331.719510 | 12 |
| ... | ... |
| 10404.000000 | 1 |
| 15457.000000 | 1 |
| 59578.000000 | 1 |
| 43183.000000 | 1 |
| 33013.000000 | 1 |
16960 rows × 1 columns
df_group.loc[df_group["Total_EMI_per_month"] > df_group["Monthly_Inhand_Salary"] * 0.3]["Total_EMI_per_month"].value_counts()
| count | |
|---|---|
| Total_EMI_per_month | |
| 10335.0 | 4 |
| 17407.0 | 3 |
| 29766.0 | 3 |
| 26590.0 | 3 |
| 71993.0 | 3 |
| ... | ... |
| 53062.0 | 1 |
| 58509.0 | 1 |
| 24556.0 | 1 |
| 78530.0 | 1 |
| 33013.0 | 1 |
4633 rows × 1 columns
df_group.loc[df_group["Total_EMI_per_month"] > df_group["Monthly_Inhand_Salary"] * 0.3, "Total_EMI_per_month"] = np.nan
df_group["Total_EMI_per_month"] = df_group.groupby(
"Customer_ID")["Total_EMI_per_month"].transform(
lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x)
def fix_inconsistent_emi(group):
# Hitung nilai modus (mode) untuk setiap grup customer_id
mode_value = group['Total_EMI_per_month'].mode()
if not mode_value.empty:
# Ganti nilai yang hanya muncul 1 kali dan bukan nilai pertama atau terakhir
group['Total_EMI_per_month'] = group['Total_EMI_per_month'].apply(
lambda x: mode_value[0] if group['Total_EMI_per_month'].value_counts()[x] == 1 and
x != group['Total_EMI_per_month'].iloc[0] and x != group['Total_EMI_per_month'].iloc[-1] else x
)
return group
df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fix_inconsistent_emi)
plt.figure(figsize=(10, 5))
sns.histplot(df_group["Total_EMI_per_month"], kde=True, bins=30, color='skyblue')
plt.title('Distribution Amount Invested Monthly')
plt.xlabel('Amount Invested Monthly')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Total_EMI_per_month'], vert=False, patch_artist=True)
plt.title('Boxplot Total_EMI_per_month')
plt.xlabel('Total_EMI_per_month')
plt.grid(True)
plt.show()
<ipython-input-90-f10def76ad72>:18: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fix_inconsistent_emi)
df_group['Amount_invested_monthly'].value_counts(dropna=False)
| count | |
|---|---|
| Amount_invested_monthly | |
| NaN | 6750 |
| __10000__ | 6480 |
| 0.0 | 275 |
| 80.41529543900253 | 1 |
| 16.53218878920387 | 1 |
| ... | ... |
| 209.17274569312266 | 1 |
| 333.0148085469461 | 1 |
| 274.68712877851107 | 1 |
| 220.58121173366908 | 1 |
| 220.45787812168732 | 1 |
136498 rows × 1 columns
df_group['Amount_invested_monthly_cleaned'] = (
df_group['Amount_invested_monthly']
.str.replace('__', '', regex=False)
.astype(float)
)
df_group["Amount_invested_monthly_cleaned"] = df_group.groupby("Customer_ID")["Amount_invested_monthly_cleaned"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
df_group['Amount_invested_monthly_cleaned'] = df_group['Amount_invested_monthly_cleaned'].round(3)
# Histogram
plt.figure(figsize=(10, 5))
sns.histplot(df_group["Amount_invested_monthly_cleaned"], kde=True, bins=30, color='skyblue')
plt.title('Distribution Amount Invested Monthly')
plt.xlabel('Amount Invested Monthly')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Boxplot
plt.figure(figsize=(5, 5))
plt.boxplot(df_group["Amount_invested_monthly_cleaned"], vert=False, patch_artist=True)
plt.title('Boxplot Amount Invested Monthly')
plt.xlabel('Amount')
plt.grid(True)
plt.show()
<ipython-input-92-1834a8f96089>:8: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill"))
plt.figure(figsize=(10, 6))
sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
plt.title('Distribution of Target Variable: Payment_Behaviour')
plt.xlabel('Count')
plt.show()
<ipython-input-93-bb439be2e0aa>:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
df_group.groupby(["Customer_ID"])["Payment_Behaviour"].nunique().head(20)
| Payment_Behaviour | |
|---|---|
| Customer_ID | |
| CUS_0x1000 | 6 |
| CUS_0x1009 | 5 |
| CUS_0x100b | 7 |
| CUS_0x1011 | 4 |
| CUS_0x1013 | 5 |
| CUS_0x1015 | 7 |
| CUS_0x1018 | 5 |
| CUS_0x1026 | 5 |
| CUS_0x102d | 6 |
| CUS_0x102e | 6 |
| CUS_0x1032 | 6 |
| CUS_0x1037 | 4 |
| CUS_0x1038 | 7 |
| CUS_0x103e | 4 |
| CUS_0x1041 | 5 |
| CUS_0x1044 | 6 |
| CUS_0x1048 | 7 |
| CUS_0x104a | 4 |
| CUS_0x104e | 6 |
| CUS_0x104f | 4 |
df_group.loc[(df_group["Payment_Behaviour"] == "!@9#%8"), "Payment_Behaviour"] = np.nan
df_group["Payment_Behaviour"] = df_group.groupby("Customer_ID")["Payment_Behaviour"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill"))
plt.figure(figsize=(10, 6))
sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
plt.title('Distribution of Target Variable: Payment_Behaviour')
plt.xlabel('Count')
plt.show()
<ipython-input-95-25f2ac98a2ae>:4: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. lambda x: x.fillna(method="ffill").fillna(method="bfill")) <ipython-input-95-25f2ac98a2ae>:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
df_group['Monthly_Balance'].value_counts(dropna=False)
| count | |
|---|---|
| Monthly_Balance | |
| NaN | 1762 |
| __-333333333333333333333333333__ | 15 |
| 600.1125089726384 | 1 |
| 120.54247208897846 | 1 |
| 784.0174708573453 | 1 |
| ... | ... |
| 279.9825060023594 | 1 |
| 260.67157274114965 | 1 |
| 250.4915113003876 | 1 |
| 243.8753153006728 | 1 |
| 360.37968260123847 | 1 |
148225 rows × 1 columns
df_group[df_group['Monthly_Balance'].str.contains('__', na=False)]['Monthly_Balance'].value_counts()
| count | |
|---|---|
| Monthly_Balance | |
| __-333333333333333333333333333__ | 15 |
df_group['Monthly_Balance_Cleaned'] = df_group['Monthly_Balance'].replace(
"__-333333333333333333333333333__", 0)
df_group['Monthly_Balance_Cleaned'] = pd.to_numeric(df_group['Monthly_Balance_Cleaned'], errors='coerce')
df_group['Monthly_Balance_Cleaned'] = df_group['Monthly_Balance_Cleaned'].round(3)
customer = df_group[df_group['Monthly_Balance_Cleaned'] == 0]
df_group[df_group['Customer_ID'].isin(customer['Customer_ID'])][['Customer_ID', 'Monthly_Balance_Cleaned']].head(10)
| Customer_ID | Monthly_Balance_Cleaned | |
|---|---|---|
| 5544 | CUS_0x9885 | 423.397 |
| 5545 | CUS_0x9885 | 0.000 |
| 5546 | CUS_0x9885 | 278.412 |
| 5547 | CUS_0x9885 | 420.557 |
| 5548 | CUS_0x9885 | 71.288 |
| 5549 | CUS_0x9885 | 383.284 |
| 5550 | CUS_0x9885 | 229.007 |
| 5551 | CUS_0x9885 | 374.031 |
| 22720 | CUS_0x4379 | 317.267 |
| 22721 | CUS_0x4379 | 290.461 |
df_group.loc[df_group["Monthly_Balance_Cleaned"] == 0, "Monthly_Balance_Cleaned"] = df_group.loc[df_group["Monthly_Balance_Cleaned"] == 0, "Customer_ID"].map(
df_group[df_group["Monthly_Balance_Cleaned"] != 0].groupby(
"Customer_ID")["Monthly_Balance_Cleaned"].mean())
df_group["Monthly_Balance_Cleaned"] = df_group.groupby(
"Customer_ID")["Monthly_Balance_Cleaned"].transform(lambda x: x.fillna(x.mean()))
plt.figure(figsize=(10, 5))
sns.histplot(df_group["Monthly_Balance_Cleaned"], kde=True, bins=30, color='green')
plt.title('Distribusi Monthly Balance')
plt.xlabel('Monthly Balance')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
plt.figure(figsize=(5, 5))
plt.boxplot(df_group["Monthly_Balance_Cleaned"], vert=False, patch_artist=True)
plt.title('Boxplot Monthly Balance')
plt.xlabel('Monthly Balance')
plt.grid(True)
plt.show()
df_group = df_group.drop(columns=['Age','SSN', 'Is_Valid_SSN', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Outstanding_Debt', 'Credit_History_Age', 'Amount_invested_monthly', 'Monthly_Balance'])
df_group.info()
<class 'pandas.core.frame.DataFrame'> Index: 150000 entries, 0 to 149999 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 150000 non-null object 1 Customer_ID 150000 non-null object 2 Month 150000 non-null object 3 Name 150000 non-null object 4 Occupation 150000 non-null object 5 Monthly_Inhand_Salary 150000 non-null float64 6 Num_Bank_Accounts 150000 non-null float64 7 Num_Credit_Card 150000 non-null float64 8 Interest_Rate 150000 non-null float64 9 Delay_from_due_date 150000 non-null int64 10 Num_Credit_Inquiries 150000 non-null float64 11 Credit_Mix 150000 non-null object 12 Credit_Utilization_Ratio 150000 non-null float64 13 Payment_of_Min_Amount 150000 non-null object 14 Total_EMI_per_month 150000 non-null float64 15 Payment_Behaviour 150000 non-null object 16 Credit_Score 100000 non-null object 17 is_train 150000 non-null int64 18 Age_Cleaned 150000 non-null int64 19 SSN_Cleaned 150000 non-null object 20 Annual_Income_Cleaned 150000 non-null float64 21 Num_of_Loan_Cleaned 150000 non-null float64 22 Type_of_Loan_Cleaned 150000 non-null object 23 Num_of_Delayed_Payment_Cleaned 150000 non-null float64 24 Changed_Credit_Limit_Cleaned 150000 non-null float64 25 Outstanding_Debt_Cleaned 150000 non-null float64 26 Credit_History_Age_in_Months 150000 non-null float64 27 Amount_invested_monthly_cleaned 150000 non-null float64 28 Monthly_Balance_Cleaned 150000 non-null float64 dtypes: float64(15), int64(3), object(11) memory usage: 38.4+ MB
df_group
| ID | Customer_ID | Month | Name | Occupation | Monthly_Inhand_Salary | Num_Bank_Accounts | Num_Credit_Card | Interest_Rate | Delay_from_due_date | ... | SSN_Cleaned | Annual_Income_Cleaned | Num_of_Loan_Cleaned | Type_of_Loan_Cleaned | Num_of_Delayed_Payment_Cleaned | Changed_Credit_Limit_Cleaned | Outstanding_Debt_Cleaned | Credit_History_Age_in_Months | Amount_invested_monthly_cleaned | Monthly_Balance_Cleaned | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0x1602 | CUS_0xd40 | January | Aaron Maashoh | Scientist | 1824.843333 | 3.0 | 4.0 | 3.0 | 3 | ... | 821-00-0265 | 19114.12 | 4.0 | Auto Loan, Credit-Builder Loan, Home Equity Lo... | 7.0 | 11.27 | 809.98 | 265.0 | 80.415 | 312.494 |
| 1 | 0x1603 | CUS_0xd40 | February | Aaron Maashoh | Scientist | 1824.843333 | 3.0 | 4.0 | 3.0 | -1 | ... | 821-00-0265 | 19114.12 | 4.0 | Auto Loan, Credit-Builder Loan, Home Equity Lo... | 7.0 | 11.27 | 809.98 | 265.0 | 118.280 | 284.629 |
| 2 | 0x1604 | CUS_0xd40 | March | Aaron Maashoh | Scientist | 1824.843333 | 3.0 | 4.0 | 3.0 | 3 | ... | 821-00-0265 | 19114.12 | 4.0 | Auto Loan, Credit-Builder Loan, Home Equity Lo... | 7.0 | 11.27 | 809.98 | 267.0 | 81.700 | 331.210 |
| 3 | 0x1605 | CUS_0xd40 | April | Aaron Maashoh | Scientist | 1824.843333 | 3.0 | 4.0 | 3.0 | 5 | ... | 821-00-0265 | 19114.12 | 4.0 | Auto Loan, Credit-Builder Loan, Home Equity Lo... | 4.0 | 6.27 | 809.98 | 268.0 | 199.458 | 223.451 |
| 4 | 0x1606 | CUS_0xd40 | May | Aaron Maashoh | Scientist | 1824.843333 | 3.0 | 4.0 | 3.0 | 6 | ... | 821-00-0265 | 19114.12 | 4.0 | Auto Loan, Credit-Builder Loan, Home Equity Lo... | 4.0 | 11.27 | 809.98 | 269.0 | 41.420 | 341.489 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 149995 | 0x25fe5 | CUS_0x8600 | December | Sarah McBridec | Architect | 1929.906667 | 10.0 | 8.0 | 29.0 | 33 | ... | 031-35-0942 | 20002.88 | 5.0 | Auto Loan, Mortgage Loan, Personal Loan, Stude... | 25.0 | 18.31 | 3571.70 | 78.0 | 146.486 | 275.540 |
| 149996 | 0x25fee | CUS_0x942c | September | Nicks | Mechanic | 3359.415833 | 4.0 | 6.0 | 7.0 | 20 | ... | 078-73-5990 | 39628.99 | 2.0 | Auto Loan, Student Loan | 6.0 | 11.50 | 502.38 | 383.0 | 181.443 | 409.395 |
| 149997 | 0x25fef | CUS_0x942c | October | Nicks | Mechanic | 3359.415833 | 4.0 | 6.0 | 7.0 | 23 | ... | 078-73-5990 | 39628.99 | 2.0 | Auto Loan, Student Loan | 5.0 | 13.50 | 502.38 | 384.0 | 10000.000 | 349.726 |
| 149998 | 0x25ff0 | CUS_0x942c | November | Nicks | Mechanic | 3359.415833 | 4.0 | 6.0 | 7.0 | 21 | ... | 078-73-5990 | 39628.99 | 2.0 | Auto Loan, Student Loan | 6.0 | 11.50 | 502.38 | 385.0 | 97.599 | 463.239 |
| 149999 | 0x25ff1 | CUS_0x942c | December | Nicks | Mechanic | 3359.415833 | 4.0 | 6.0 | 7.0 | 22 | ... | 078-73-5990 | 39628.99 | 2.0 | Auto Loan, Student Loan | 5.0 | 11.50 | 502.38 | 386.0 | 220.458 | 360.380 |
150000 rows × 29 columns
Exploratory Data Analysis After Cleaning¶
plt.figure(figsize=(20, 40))
numeric = df_group.select_dtypes(include=['number']).columns # Select numerical columns
for i, col in enumerate(numeric):
plt.subplot(9, 3, i + 1)
sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
plt.title(f'Boxplot of {col} by Credit Score', fontsize=13, pad=10)
plt.xlabel('Credit Score', fontsize=13)
plt.ylabel(col, fontsize=13)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
min_val = df_group[col].min()
max_val = df_group[col].max()
margin = (max_val - min_val) * 0.1
plt.ylim(min_val - margin, max_val + margin)
plt.subplots_adjust(wspace=0.4, hspace=0.7)
plt.tight_layout()
plt.show()
<ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1') <ipython-input-102-a2600b2eb564>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
numeric = df_group.select_dtypes(exclude=object).columns
fig, ax = plt.subplots(figsize=(20, 35))
fig.patch.set_facecolor('white')
mpl.rcParams['font.family'] = 'DejaVu Sans'
mpl.rcParams['font.size'] = 12
colors = sns.color_palette("tab10", n_colors=len(numeric)).as_hex()
for i, col in enumerate(numeric):
plt.subplot(9, 3, i + 1)
sns.kdeplot(x=df_group[col], color=colors[i], fill=True)
plt.title(f'Distribution of {col}', fontsize=10)
plt.grid(visible=False)
plt.tight_layout()
plt.show()
import numpy as np
plt.figure(figsize=(10, len(numeric) * 4))
for i, col in enumerate(numeric):
try:
if df_group[col].nunique() > 1: # Ensure column has sufficient variance
plt.subplot(len(numeric), 1, i + 1)
sns.histplot(
data=df_group,
x=col,
hue='Credit_Score',
kde=True,
bins=30,
palette='Set2'
)
plt.title(f'Distribution of {col} by Credit Score', fontsize=13, pad=10)
plt.xlabel(col, fontsize=13)
plt.ylabel('Frequency', fontsize=13)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
else:
print(f"Skipping column '{col}' due to insufficient variance.")
except np.linalg.LinAlgError:
print(f"KDE failed for column '{col}', disabling KDE.")
plt.subplot(len(numeric), 1, i + 1)
sns.histplot(
data=df_group,
x=col,
hue='Credit_Score',
kde=False, # Disable KDE if it fails
bins=30,
palette='Set2'
)
plt.title(f'Distribution of {col} by Credit Score (KDE Disabled)', fontsize=13, pad=10)
plt.xlabel(col, fontsize=13)
plt.ylabel('Frequency', fontsize=13)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.subplots_adjust(wspace=0.4, hspace=0.7)
plt.tight_layout()
plt.show()
KDE failed for column 'is_train', disabling KDE.
plt.figure(figsize=(17,10))
cor = df_group[numeric].corr(method="kendall")
sns.heatmap(cor, annot=True)
plt.title('"Correlation of Numeric Features"')
plt.show()
Feature Selection¶
object_columns = df_group.select_dtypes(include=['object']).columns
object_columns = object_columns[object_columns != 'test_yes']
print(object_columns)
Index(['ID', 'Customer_ID', 'Month', 'Name', 'Occupation', 'Credit_Mix',
'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Score',
'SSN_Cleaned', 'Type_of_Loan_Cleaned'],
dtype='object')
for col in object_columns:
unique_values = df_group[col].nunique()
print(f"Different and unique columns {col}: ({unique_values})")
Different and unique columns ID: (150000) Different and unique columns Customer_ID: (12500) Different and unique columns Month: (12) Different and unique columns Name: (10139) Different and unique columns Occupation: (15) Different and unique columns Credit_Mix: (3) Different and unique columns Payment_of_Min_Amount: (2) Different and unique columns Payment_Behaviour: (6) Different and unique columns Credit_Score: (3) Different and unique columns SSN_Cleaned: (12500) Different and unique columns Type_of_Loan_Cleaned: (508)
month_to_number = {
'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5,
'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10,
'November': 11, 'December': 12
}
df_group['Month_Number'] = df_group['Month'].map(month_to_number)
Credit_Mix_to_Score = {
'Good': 2,
'Standard': 1,
'Bad': 0
}
df_group['Credit_Mix_Score'] = df_group['Credit_Mix'].map(Credit_Mix_to_Score)
Payment_of_Min_Amount_to_bolean = {
'Yes': 1,
'No': 0
}
df_group['Payment_of_Min_Amount_Yes'] = df_group['Payment_of_Min_Amount'].map(Payment_of_Min_Amount_to_bolean)
Payment_Behaviour_to_Score = {
'Low_spent_Small_value_payments': 1,
'Low_spent_Medium_value_payments': 2,
'Low_spent_Large_value_payments': 3,
'High_spent_Small_value_payments': 4,
'High_spent_Medium_value_payments': 5,
'High_spent_Large_value_payments': 6
}
df_group['Payment_Behaviour_Score'] = df_group['Payment_Behaviour'].map(Payment_Behaviour_to_Score)
df_group['Credit_Score'].unique()
array(['Good', 'Standard', 'Poor', nan], dtype=object)
Credit_Score_to_Number = {
'Good': 2,
'Standard': 1,
'Poor': 0
}
df_group['Credit_Score'] = df_group['Credit_Score'].map(Credit_Score_to_Number)
plt.figure(figsize=(17,10))
cor = df_group[numeric].corr(method="kendall")
sns.heatmap(cor, annot=True)
plt.title('Correlation of Numeric Features After Feature Engineering')
plt.show()
loan = df_group
loan['Type_of_Loan_Cleaned'] = loan['Type_of_Loan_Cleaned'].str.strip()
loan['Type_of_Loan_Cleaned'] = loan['Type_of_Loan_Cleaned'].str.replace(' ', '_', regex=True)
loan['Type_of_Loan_Cleaned'] = loan['Type_of_Loan_Cleaned'].str.replace('-', '_', regex=True)
loan = loan['Type_of_Loan_Cleaned'].str.get_dummies(sep=',_')
df_group = pd.concat([df_group, loan], axis=1)
df_group = pd.get_dummies(df_group, columns=['Occupation'])
df_group = df_group.select_dtypes(exclude=['object'])
df_group = df_group.drop(columns=['Month_Number'])
df_group
| Monthly_Inhand_Salary | Num_Bank_Accounts | Num_Credit_Card | Interest_Rate | Delay_from_due_date | Num_Credit_Inquiries | Credit_Utilization_Ratio | Total_EMI_per_month | Credit_Score | is_train | ... | Occupation_Entrepreneur | Occupation_Journalist | Occupation_Lawyer | Occupation_Manager | Occupation_Mechanic | Occupation_Media_Manager | Occupation_Musician | Occupation_Scientist | Occupation_Teacher | Occupation_Writer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1824.843333 | 3.0 | 4.0 | 3.0 | 3 | 4.0 | 26.822620 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 1 | 1824.843333 | 3.0 | 4.0 | 3.0 | -1 | 4.0 | 31.944960 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 2 | 1824.843333 | 3.0 | 4.0 | 3.0 | 3 | 4.0 | 28.609352 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 3 | 1824.843333 | 3.0 | 4.0 | 3.0 | 5 | 4.0 | 31.377862 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 4 | 1824.843333 | 3.0 | 4.0 | 3.0 | 6 | 4.0 | 24.797347 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 149995 | 1929.906667 | 10.0 | 8.0 | 29.0 | 33 | 12.0 | 34.780553 | 60.964772 | NaN | 0 | ... | False | False | False | False | False | False | False | False | False | False |
| 149996 | 3359.415833 | 4.0 | 6.0 | 7.0 | 20 | 7.0 | 27.758522 | 35.104023 | NaN | 0 | ... | False | False | False | False | True | False | False | False | False | False |
| 149997 | 3359.415833 | 4.0 | 6.0 | 7.0 | 23 | 7.0 | 36.858542 | 35.104023 | NaN | 0 | ... | False | False | False | False | True | False | False | False | False | False |
| 149998 | 3359.415833 | 4.0 | 6.0 | 7.0 | 21 | 7.0 | 39.139840 | 35.104023 | NaN | 0 | ... | False | False | False | False | True | False | False | False | False | False |
| 149999 | 3359.415833 | 4.0 | 6.0 | 7.0 | 22 | 7.0 | 34.108530 | 35.104023 | NaN | 0 | ... | False | False | False | False | True | False | False | False | False | False |
150000 rows × 47 columns
data_train = df_group[df_group['is_train'] == 0].drop(columns=['is_train'])
data_test = df_group[df_group['is_train'] == 1].drop(columns=['is_train'])
Credit_Score = data_train.pop('Credit_Score')
data_train['Credit_Score'] = Credit_Score
data_test = data_test.drop(columns=['Credit_Score'])
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100000 entries, 0 to 99999 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 100000 non-null object 1 Customer_ID 100000 non-null object 2 Month 100000 non-null object 3 Name 90015 non-null object 4 Age 100000 non-null object 5 SSN 100000 non-null object 6 Occupation 100000 non-null object 7 Annual_Income 100000 non-null object 8 Monthly_Inhand_Salary 84998 non-null float64 9 Num_Bank_Accounts 100000 non-null int64 10 Num_Credit_Card 100000 non-null int64 11 Interest_Rate 100000 non-null int64 12 Num_of_Loan 100000 non-null object 13 Type_of_Loan 88592 non-null object 14 Delay_from_due_date 100000 non-null int64 15 Num_of_Delayed_Payment 92998 non-null object 16 Changed_Credit_Limit 100000 non-null object 17 Num_Credit_Inquiries 98035 non-null float64 18 Credit_Mix 100000 non-null object 19 Outstanding_Debt 100000 non-null object 20 Credit_Utilization_Ratio 100000 non-null float64 21 Credit_History_Age 90970 non-null object 22 Payment_of_Min_Amount 100000 non-null object 23 Total_EMI_per_month 100000 non-null float64 24 Amount_invested_monthly 95521 non-null object 25 Payment_Behaviour 100000 non-null object 26 Monthly_Balance 98800 non-null object 27 Credit_Score 100000 non-null object 28 is_train 100000 non-null int64 dtypes: float64(4), int64(5), object(20) memory usage: 22.1+ MB
df_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 50000 non-null object 1 Customer_ID 50000 non-null object 2 Month 50000 non-null object 3 Name 44985 non-null object 4 Age 50000 non-null object 5 SSN 50000 non-null object 6 Occupation 50000 non-null object 7 Annual_Income 50000 non-null object 8 Monthly_Inhand_Salary 42502 non-null float64 9 Num_Bank_Accounts 50000 non-null int64 10 Num_Credit_Card 50000 non-null int64 11 Interest_Rate 50000 non-null int64 12 Num_of_Loan 50000 non-null object 13 Type_of_Loan 44296 non-null object 14 Delay_from_due_date 50000 non-null int64 15 Num_of_Delayed_Payment 46502 non-null object 16 Changed_Credit_Limit 50000 non-null object 17 Num_Credit_Inquiries 48965 non-null float64 18 Credit_Mix 50000 non-null object 19 Outstanding_Debt 50000 non-null object 20 Credit_Utilization_Ratio 50000 non-null float64 21 Credit_History_Age 45530 non-null object 22 Payment_of_Min_Amount 50000 non-null object 23 Total_EMI_per_month 50000 non-null float64 24 Amount_invested_monthly 47729 non-null object 25 Payment_Behaviour 50000 non-null object 26 Monthly_Balance 49438 non-null object 27 is_train 50000 non-null int64 dtypes: float64(4), int64(5), object(19) memory usage: 10.7+ MB
df_group.head()
| Monthly_Inhand_Salary | Num_Bank_Accounts | Num_Credit_Card | Interest_Rate | Delay_from_due_date | Num_Credit_Inquiries | Credit_Utilization_Ratio | Total_EMI_per_month | Credit_Score | is_train | ... | Occupation_Entrepreneur | Occupation_Journalist | Occupation_Lawyer | Occupation_Manager | Occupation_Mechanic | Occupation_Media_Manager | Occupation_Musician | Occupation_Scientist | Occupation_Teacher | Occupation_Writer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1824.843333 | 3.0 | 4.0 | 3.0 | 3 | 4.0 | 26.822620 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 1 | 1824.843333 | 3.0 | 4.0 | 3.0 | -1 | 4.0 | 31.944960 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 2 | 1824.843333 | 3.0 | 4.0 | 3.0 | 3 | 4.0 | 28.609352 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 3 | 1824.843333 | 3.0 | 4.0 | 3.0 | 5 | 4.0 | 31.377862 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
| 4 | 1824.843333 | 3.0 | 4.0 | 3.0 | 6 | 4.0 | 24.797347 | 49.574949 | 2.0 | 1 | ... | False | False | False | False | False | False | False | True | False | False |
5 rows × 47 columns
from sklearn.preprocessing import LabelEncoder
categorical_columns = df_group.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
le = LabelEncoder()
df_group[col] = le.fit_transform(df_group[col].astype(str))
print("Converted categorical features into numeric.")
Converted categorical features into numeric.
Modeling¶
Resampling¶
from imblearn.over_sampling import RandomOverSampler
X = df_train.drop(columns=['Credit_Score'])
y = df_train["Credit_Score"]
X, y = RandomOverSampler().fit_resample(X, y)
df_resampling = pd.concat([X, y], axis=1)
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Score', data=df_resampling, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Score After Resampling')
plt.xlabel('Credit_Score After Resampling')
plt.ylabel('Count')
plt.show()
<ipython-input-130-dc734f9f83f5>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Credit_Score', data=df_resampling, palette='pastel')
Split¶
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train',X_train.shape)
print('y_train',y_train.shape)
print('X_test',X_test.shape)
print('y_test',y_test.shape)
X_train (127617, 28) y_train (127617,) X_test (31905, 28) y_test (31905,)
XGBClassifier¶
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
df_group.fillna(0, inplace=True)
from sklearn.preprocessing import LabelEncoder
import pandas as pd
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']
for feature in categorical_features:
if feature in X_train.columns and feature in X_test.columns:
le = LabelEncoder()
le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
X_train[feature] = le.transform(X_train[feature].astype(str))
X_test[feature] = le.transform(X_test[feature].astype(str))
else:
print(f"Feature '{feature}' not found in both X_train and X_test")
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
imp = pd.Series(data=xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 12))
sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
plt.title("Feature Importance", fontsize=16)
plt.xlabel("Importance Score", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [07:43:17] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
<ipython-input-135-5b29f37426dc>:13: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, xgb_model.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.820122237893747
Precision: 0.8192873295571445
Recall: 0.820122237893747
precision recall f1-score support
0 0.82 0.93 0.87 10710
1 0.83 0.86 0.84 10578
2 0.81 0.67 0.73 10617
accuracy 0.82 31905
macro avg 0.82 0.82 0.82 31905
weighted avg 0.82 0.82 0.82 31905
AUC-ROC Score: 0.9312729492479699
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 10],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.8, 1.0]
}
grid_search = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters XGBClassifier:", grid_search.best_params_)
/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [08:41:17] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
Best Parameters XGBClassifier: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}
best_params = grid_search.best_params_
optimized_xgb = XGBClassifier(
use_label_encoder=False,
eval_metric='mlogloss',
n_estimators=best_params['n_estimators'],
max_depth=best_params['max_depth'],
learning_rate=best_params['learning_rate'],
subsample=best_params['subsample']
)
eval_set = [(X_train, y_train), (X_test, y_test)]
optimized_xgb.fit(X_train, y_train, eval_set=eval_set, verbose=True)
results_optimized_xgb = optimized_xgb.evals_result()
epochs = len(results_optimized_xgb['validation_0']['mlogloss'])
x_axis = range(0, epochs)
plt.figure(figsize=(10, 6))
plt.plot(x_axis, results_optimized_xgb['validation_0']['mlogloss'], label='Train')
plt.plot(x_axis, results_optimized_xgb['validation_1']['mlogloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('Training and Validation Log Loss')
plt.legend()
plt.grid()
plt.show()
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [08:48:40] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
[0] validation_0-mlogloss:0.97079 validation_1-mlogloss:0.97546 [1] validation_0-mlogloss:0.87821 validation_1-mlogloss:0.88724 [2] validation_0-mlogloss:0.80828 validation_1-mlogloss:0.82175 [3] validation_0-mlogloss:0.75284 validation_1-mlogloss:0.77045 [4] validation_0-mlogloss:0.70945 validation_1-mlogloss:0.73094 [5] validation_0-mlogloss:0.67420 validation_1-mlogloss:0.69953 [6] validation_0-mlogloss:0.64561 validation_1-mlogloss:0.67437 [7] validation_0-mlogloss:0.62000 validation_1-mlogloss:0.65264 [8] validation_0-mlogloss:0.59681 validation_1-mlogloss:0.63383 [9] validation_0-mlogloss:0.57665 validation_1-mlogloss:0.61815 [10] validation_0-mlogloss:0.55835 validation_1-mlogloss:0.60361 [11] validation_0-mlogloss:0.54008 validation_1-mlogloss:0.58900 [12] validation_0-mlogloss:0.52510 validation_1-mlogloss:0.57715 [13] validation_0-mlogloss:0.51202 validation_1-mlogloss:0.56692 [14] validation_0-mlogloss:0.49934 validation_1-mlogloss:0.55724 [15] validation_0-mlogloss:0.48750 validation_1-mlogloss:0.54842 [16] validation_0-mlogloss:0.47507 validation_1-mlogloss:0.53940 [17] validation_0-mlogloss:0.46492 validation_1-mlogloss:0.53249 [18] validation_0-mlogloss:0.45438 validation_1-mlogloss:0.52438 [19] validation_0-mlogloss:0.44542 validation_1-mlogloss:0.51793 [20] validation_0-mlogloss:0.43685 validation_1-mlogloss:0.51154 [21] validation_0-mlogloss:0.42769 validation_1-mlogloss:0.50463 [22] validation_0-mlogloss:0.41912 validation_1-mlogloss:0.49758 [23] validation_0-mlogloss:0.41174 validation_1-mlogloss:0.49241 [24] validation_0-mlogloss:0.40445 validation_1-mlogloss:0.48685 [25] validation_0-mlogloss:0.39623 validation_1-mlogloss:0.48070 [26] validation_0-mlogloss:0.39003 validation_1-mlogloss:0.47576 [27] validation_0-mlogloss:0.38500 validation_1-mlogloss:0.47176 [28] validation_0-mlogloss:0.38015 validation_1-mlogloss:0.46804 [29] validation_0-mlogloss:0.37395 validation_1-mlogloss:0.46347 [30] validation_0-mlogloss:0.36709 validation_1-mlogloss:0.45814 [31] validation_0-mlogloss:0.36088 validation_1-mlogloss:0.45349 [32] validation_0-mlogloss:0.35700 validation_1-mlogloss:0.45057 [33] validation_0-mlogloss:0.35281 validation_1-mlogloss:0.44739 [34] validation_0-mlogloss:0.34783 validation_1-mlogloss:0.44391 [35] validation_0-mlogloss:0.34205 validation_1-mlogloss:0.43932 [36] validation_0-mlogloss:0.33694 validation_1-mlogloss:0.43576 [37] validation_0-mlogloss:0.33233 validation_1-mlogloss:0.43261 [38] validation_0-mlogloss:0.32824 validation_1-mlogloss:0.42983 [39] validation_0-mlogloss:0.32345 validation_1-mlogloss:0.42682 [40] validation_0-mlogloss:0.31951 validation_1-mlogloss:0.42355 [41] validation_0-mlogloss:0.31562 validation_1-mlogloss:0.42101 [42] validation_0-mlogloss:0.31141 validation_1-mlogloss:0.41789 [43] validation_0-mlogloss:0.30756 validation_1-mlogloss:0.41520 [44] validation_0-mlogloss:0.30377 validation_1-mlogloss:0.41232 [45] validation_0-mlogloss:0.29928 validation_1-mlogloss:0.40887 [46] validation_0-mlogloss:0.29614 validation_1-mlogloss:0.40702 [47] validation_0-mlogloss:0.29114 validation_1-mlogloss:0.40337 [48] validation_0-mlogloss:0.28680 validation_1-mlogloss:0.39978 [49] validation_0-mlogloss:0.28319 validation_1-mlogloss:0.39753 [50] validation_0-mlogloss:0.27714 validation_1-mlogloss:0.39267 [51] validation_0-mlogloss:0.27181 validation_1-mlogloss:0.38874 [52] validation_0-mlogloss:0.26847 validation_1-mlogloss:0.38657 [53] validation_0-mlogloss:0.26530 validation_1-mlogloss:0.38434 [54] validation_0-mlogloss:0.26186 validation_1-mlogloss:0.38180 [55] validation_0-mlogloss:0.25868 validation_1-mlogloss:0.38006 [56] validation_0-mlogloss:0.25597 validation_1-mlogloss:0.37822 [57] validation_0-mlogloss:0.25279 validation_1-mlogloss:0.37646 [58] validation_0-mlogloss:0.24998 validation_1-mlogloss:0.37440 [59] validation_0-mlogloss:0.24674 validation_1-mlogloss:0.37217 [60] validation_0-mlogloss:0.24340 validation_1-mlogloss:0.37004 [61] validation_0-mlogloss:0.23985 validation_1-mlogloss:0.36786 [62] validation_0-mlogloss:0.23592 validation_1-mlogloss:0.36534 [63] validation_0-mlogloss:0.23195 validation_1-mlogloss:0.36259 [64] validation_0-mlogloss:0.22845 validation_1-mlogloss:0.36053 [65] validation_0-mlogloss:0.22536 validation_1-mlogloss:0.35843 [66] validation_0-mlogloss:0.22218 validation_1-mlogloss:0.35683 [67] validation_0-mlogloss:0.21900 validation_1-mlogloss:0.35462 [68] validation_0-mlogloss:0.21575 validation_1-mlogloss:0.35273 [69] validation_0-mlogloss:0.21299 validation_1-mlogloss:0.35084 [70] validation_0-mlogloss:0.20951 validation_1-mlogloss:0.34865 [71] validation_0-mlogloss:0.20664 validation_1-mlogloss:0.34659 [72] validation_0-mlogloss:0.20359 validation_1-mlogloss:0.34475 [73] validation_0-mlogloss:0.20142 validation_1-mlogloss:0.34325 [74] validation_0-mlogloss:0.19907 validation_1-mlogloss:0.34166 [75] validation_0-mlogloss:0.19664 validation_1-mlogloss:0.34010 [76] validation_0-mlogloss:0.19361 validation_1-mlogloss:0.33818 [77] validation_0-mlogloss:0.19128 validation_1-mlogloss:0.33634 [78] validation_0-mlogloss:0.18866 validation_1-mlogloss:0.33474 [79] validation_0-mlogloss:0.18685 validation_1-mlogloss:0.33360 [80] validation_0-mlogloss:0.18461 validation_1-mlogloss:0.33239 [81] validation_0-mlogloss:0.18244 validation_1-mlogloss:0.33113 [82] validation_0-mlogloss:0.18038 validation_1-mlogloss:0.32992 [83] validation_0-mlogloss:0.17756 validation_1-mlogloss:0.32806 [84] validation_0-mlogloss:0.17537 validation_1-mlogloss:0.32689 [85] validation_0-mlogloss:0.17322 validation_1-mlogloss:0.32594 [86] validation_0-mlogloss:0.17051 validation_1-mlogloss:0.32438 [87] validation_0-mlogloss:0.16720 validation_1-mlogloss:0.32254 [88] validation_0-mlogloss:0.16500 validation_1-mlogloss:0.32124 [89] validation_0-mlogloss:0.16290 validation_1-mlogloss:0.32031 [90] validation_0-mlogloss:0.16044 validation_1-mlogloss:0.31882 [91] validation_0-mlogloss:0.15836 validation_1-mlogloss:0.31762 [92] validation_0-mlogloss:0.15641 validation_1-mlogloss:0.31644 [93] validation_0-mlogloss:0.15438 validation_1-mlogloss:0.31503 [94] validation_0-mlogloss:0.15279 validation_1-mlogloss:0.31429 [95] validation_0-mlogloss:0.15086 validation_1-mlogloss:0.31297 [96] validation_0-mlogloss:0.14914 validation_1-mlogloss:0.31206 [97] validation_0-mlogloss:0.14759 validation_1-mlogloss:0.31110 [98] validation_0-mlogloss:0.14576 validation_1-mlogloss:0.30998 [99] validation_0-mlogloss:0.14350 validation_1-mlogloss:0.30863 [100] validation_0-mlogloss:0.14180 validation_1-mlogloss:0.30766 [101] validation_0-mlogloss:0.13957 validation_1-mlogloss:0.30629 [102] validation_0-mlogloss:0.13779 validation_1-mlogloss:0.30554 [103] validation_0-mlogloss:0.13619 validation_1-mlogloss:0.30477 [104] validation_0-mlogloss:0.13426 validation_1-mlogloss:0.30354 [105] validation_0-mlogloss:0.13306 validation_1-mlogloss:0.30296 [106] validation_0-mlogloss:0.13139 validation_1-mlogloss:0.30212 [107] validation_0-mlogloss:0.12989 validation_1-mlogloss:0.30151 [108] validation_0-mlogloss:0.12812 validation_1-mlogloss:0.30063 [109] validation_0-mlogloss:0.12649 validation_1-mlogloss:0.29984 [110] validation_0-mlogloss:0.12508 validation_1-mlogloss:0.29911 [111] validation_0-mlogloss:0.12324 validation_1-mlogloss:0.29799 [112] validation_0-mlogloss:0.12197 validation_1-mlogloss:0.29728 [113] validation_0-mlogloss:0.12030 validation_1-mlogloss:0.29640 [114] validation_0-mlogloss:0.11892 validation_1-mlogloss:0.29571 [115] validation_0-mlogloss:0.11681 validation_1-mlogloss:0.29453 [116] validation_0-mlogloss:0.11585 validation_1-mlogloss:0.29404 [117] validation_0-mlogloss:0.11434 validation_1-mlogloss:0.29308 [118] validation_0-mlogloss:0.11305 validation_1-mlogloss:0.29229 [119] validation_0-mlogloss:0.11171 validation_1-mlogloss:0.29146 [120] validation_0-mlogloss:0.11063 validation_1-mlogloss:0.29114 [121] validation_0-mlogloss:0.10906 validation_1-mlogloss:0.29021 [122] validation_0-mlogloss:0.10765 validation_1-mlogloss:0.28949 [123] validation_0-mlogloss:0.10672 validation_1-mlogloss:0.28906 [124] validation_0-mlogloss:0.10530 validation_1-mlogloss:0.28821 [125] validation_0-mlogloss:0.10427 validation_1-mlogloss:0.28755 [126] validation_0-mlogloss:0.10327 validation_1-mlogloss:0.28687 [127] validation_0-mlogloss:0.10225 validation_1-mlogloss:0.28639 [128] validation_0-mlogloss:0.10118 validation_1-mlogloss:0.28594 [129] validation_0-mlogloss:0.10000 validation_1-mlogloss:0.28526 [130] validation_0-mlogloss:0.09897 validation_1-mlogloss:0.28481 [131] validation_0-mlogloss:0.09791 validation_1-mlogloss:0.28409 [132] validation_0-mlogloss:0.09667 validation_1-mlogloss:0.28360 [133] validation_0-mlogloss:0.09532 validation_1-mlogloss:0.28283 [134] validation_0-mlogloss:0.09409 validation_1-mlogloss:0.28228 [135] validation_0-mlogloss:0.09288 validation_1-mlogloss:0.28156 [136] validation_0-mlogloss:0.09151 validation_1-mlogloss:0.28087 [137] validation_0-mlogloss:0.09033 validation_1-mlogloss:0.28018 [138] validation_0-mlogloss:0.08961 validation_1-mlogloss:0.27995 [139] validation_0-mlogloss:0.08834 validation_1-mlogloss:0.27933 [140] validation_0-mlogloss:0.08725 validation_1-mlogloss:0.27897 [141] validation_0-mlogloss:0.08628 validation_1-mlogloss:0.27866 [142] validation_0-mlogloss:0.08537 validation_1-mlogloss:0.27822 [143] validation_0-mlogloss:0.08439 validation_1-mlogloss:0.27770 [144] validation_0-mlogloss:0.08361 validation_1-mlogloss:0.27737 [145] validation_0-mlogloss:0.08232 validation_1-mlogloss:0.27670 [146] validation_0-mlogloss:0.08134 validation_1-mlogloss:0.27616 [147] validation_0-mlogloss:0.08038 validation_1-mlogloss:0.27574 [148] validation_0-mlogloss:0.07939 validation_1-mlogloss:0.27530 [149] validation_0-mlogloss:0.07870 validation_1-mlogloss:0.27496 [150] validation_0-mlogloss:0.07797 validation_1-mlogloss:0.27459 [151] validation_0-mlogloss:0.07692 validation_1-mlogloss:0.27394 [152] validation_0-mlogloss:0.07626 validation_1-mlogloss:0.27359 [153] validation_0-mlogloss:0.07525 validation_1-mlogloss:0.27310 [154] validation_0-mlogloss:0.07438 validation_1-mlogloss:0.27277 [155] validation_0-mlogloss:0.07351 validation_1-mlogloss:0.27242 [156] validation_0-mlogloss:0.07256 validation_1-mlogloss:0.27217 [157] validation_0-mlogloss:0.07178 validation_1-mlogloss:0.27208 [158] validation_0-mlogloss:0.07114 validation_1-mlogloss:0.27180 [159] validation_0-mlogloss:0.07040 validation_1-mlogloss:0.27144 [160] validation_0-mlogloss:0.06968 validation_1-mlogloss:0.27107 [161] validation_0-mlogloss:0.06887 validation_1-mlogloss:0.27071 [162] validation_0-mlogloss:0.06806 validation_1-mlogloss:0.27028 [163] validation_0-mlogloss:0.06735 validation_1-mlogloss:0.27003 [164] validation_0-mlogloss:0.06671 validation_1-mlogloss:0.26983 [165] validation_0-mlogloss:0.06591 validation_1-mlogloss:0.26958 [166] validation_0-mlogloss:0.06515 validation_1-mlogloss:0.26920 [167] validation_0-mlogloss:0.06438 validation_1-mlogloss:0.26877 [168] validation_0-mlogloss:0.06357 validation_1-mlogloss:0.26827 [169] validation_0-mlogloss:0.06296 validation_1-mlogloss:0.26800 [170] validation_0-mlogloss:0.06228 validation_1-mlogloss:0.26777 [171] validation_0-mlogloss:0.06144 validation_1-mlogloss:0.26739 [172] validation_0-mlogloss:0.06064 validation_1-mlogloss:0.26718 [173] validation_0-mlogloss:0.05989 validation_1-mlogloss:0.26689 [174] validation_0-mlogloss:0.05919 validation_1-mlogloss:0.26666 [175] validation_0-mlogloss:0.05878 validation_1-mlogloss:0.26650 [176] validation_0-mlogloss:0.05841 validation_1-mlogloss:0.26634 [177] validation_0-mlogloss:0.05772 validation_1-mlogloss:0.26617 [178] validation_0-mlogloss:0.05711 validation_1-mlogloss:0.26595 [179] validation_0-mlogloss:0.05650 validation_1-mlogloss:0.26573 [180] validation_0-mlogloss:0.05573 validation_1-mlogloss:0.26535 [181] validation_0-mlogloss:0.05518 validation_1-mlogloss:0.26528 [182] validation_0-mlogloss:0.05445 validation_1-mlogloss:0.26492 [183] validation_0-mlogloss:0.05402 validation_1-mlogloss:0.26466 [184] validation_0-mlogloss:0.05344 validation_1-mlogloss:0.26446 [185] validation_0-mlogloss:0.05300 validation_1-mlogloss:0.26431 [186] validation_0-mlogloss:0.05247 validation_1-mlogloss:0.26408 [187] validation_0-mlogloss:0.05187 validation_1-mlogloss:0.26385 [188] validation_0-mlogloss:0.05123 validation_1-mlogloss:0.26374 [189] validation_0-mlogloss:0.05050 validation_1-mlogloss:0.26346 [190] validation_0-mlogloss:0.05006 validation_1-mlogloss:0.26333 [191] validation_0-mlogloss:0.04953 validation_1-mlogloss:0.26312 [192] validation_0-mlogloss:0.04883 validation_1-mlogloss:0.26300 [193] validation_0-mlogloss:0.04837 validation_1-mlogloss:0.26288 [194] validation_0-mlogloss:0.04782 validation_1-mlogloss:0.26273 [195] validation_0-mlogloss:0.04726 validation_1-mlogloss:0.26244 [196] validation_0-mlogloss:0.04678 validation_1-mlogloss:0.26216 [197] validation_0-mlogloss:0.04632 validation_1-mlogloss:0.26201 [198] validation_0-mlogloss:0.04577 validation_1-mlogloss:0.26198 [199] validation_0-mlogloss:0.04538 validation_1-mlogloss:0.26183
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Assuming categorical_features contains your list of object type columns
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']
for feature in categorical_features:
if feature in X_train.columns and feature in X_test.columns:
le = LabelEncoder()
# Fit on the combined unique values from both train and test
le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
X_train[feature] = le.transform(X_train[feature].astype(str))
X_test[feature] = le.transform(X_test[feature].astype(str))
else:
print(f"Feature '{feature}' not found in both X_train and X_test")
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str)) <ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_test[feature] = le.transform(X_test[feature].astype(str))
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Assuming categorical_features contains your list of object type columns
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']
# Ensure X_test has the same columns as X_train used during training
X_test = X_test[[col for col in X_train.columns if col in X_test.columns]]
for feature in categorical_features:
if feature in X_train.columns and feature in X_test.columns:
le = LabelEncoder()
# Fit on the combined unique values from both train and test
le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
X_train[feature] = le.transform(X_train[feature].astype(str))
X_test[feature] = le.transform(X_test[feature].astype(str))
else:
print(f"Feature '{feature}' not found in both X_train and X_test")
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str))
y_pred = optimized_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, optimized_xgb.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.6871114299378288
Precision: 0.6917502759377729
Recall: 0.6871114299378288
precision recall f1-score support
0 0.62 0.74 0.67 1465
1 0.64 0.65 0.65 2488
2 0.74 0.69 0.72 4411
accuracy 0.69 8364
macro avg 0.67 0.69 0.68 8364
weighted avg 0.69 0.69 0.69 8364
AUC-ROC Score: 0.8248599898132657
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
X_Kfold = X_train
y_Kfold = y_train
optimized_xgb_kfold = optimized_xgb
# K-Fold Cross Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Manual K-Fold
f1_scores, precision_scores, recall_scores = [], [], []
# --- Fit LabelEncoders outside the loop ---
categorical_features = X.select_dtypes(include=['object']).columns
label_encoders = {} # Store LabelEncoders for each feature
for feature in categorical_features:
label_encoders[feature] = LabelEncoder()
# Fit on all data to avoid unseen labels
label_encoders[feature].fit(X[feature].astype(str).unique())
# --- Now iterate through folds ---
for train_index, test_index in kf.split(X_Kfold, y_Kfold):
# Select data using .iloc
X_train_fold = X.iloc[train_index]
X_test_fold = X.iloc[test_index]
y_train_fold = y.iloc[train_index]
y_test_fold = y.iloc[test_index]
# --- Apply Label Encoding using pre-fitted encoders ---
for feature in categorical_features:
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
# --- Convert all data to float after encoding ---
X_train_fold = X_train_fold.astype(float)
X_test_fold = X_test_fold.astype(float)
# Encode the target variable within the loop
le = LabelEncoder()
y_train_fold = le.fit_transform(y_train_fold)
y_test_fold = le.transform(y_test_fold)
optimized_xgb_kfold.fit(X_train_fold, y_train_fold) # Use the converted data
y_pred = optimized_xgb_kfold.predict(X_test_fold) # Use the converted data
f1_scores.append(f1_score(y_test_fold, y_pred, average='weighted'))
precision_scores.append(precision_score(y_test_fold, y_pred, average='weighted'))
recall_scores.append(recall_score(y_test_fold, y_pred, average='weighted'))
print("Mean F1-Score:", np.mean(f1_scores))
print("Mean Precision:", np.mean(precision_scores))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:01:03] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:01:25] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:01:48] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:02:05] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:02:22] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
Mean F1-Score: 0.795492429540106 Mean Precision: 0.795518354123495
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Assuming categorical_features contains your list of object type columns
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']
# Ensure X_test has the same columns as X_train used during training
# This line is crucial to ensure compatibility
X_test = X_test[[col for col in X_train.columns if col in X_test.columns]]
for feature in categorical_features:
if feature in X_train.columns and feature in X_test.columns:
le = LabelEncoder()
# Fit on the combined unique values from both train and test
le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
X_train[feature] = le.transform(X_train[feature].astype(str))
X_test[feature] = le.transform(X_test[feature].astype(str))
else:
print(f"Feature '{feature}' not found in both X_train and X_test")
# --- Convert all data to float after encoding ---
# This line ensures XGBoost compatibility
X_test = X_test.astype(float)
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str)) <ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[feature] = le.transform(X_train[feature].astype(str))
RFC try¶
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Plot features importances
imp = pd.Series(data=rf_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
<ipython-input-160-7a7af09c5d56>:10: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
from sklearn.ensemble import RandomForestClassifier
train_scores = []
val_scores = []
for n_estimators in range(10, 201, 10):
rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
rf_model.fit(X_train, y_train)
train_pred = rf_model.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)
train_scores.append(train_acc)
val_pred = rf_model.predict(X_test)
val_acc = accuracy_score(y_test, val_pred)
val_scores.append(val_acc)
plt.figure(figsize=(10, 6))
plt.plot(range(10, 201, 10), train_scores, label='Train Accuracy')
plt.plot(range(10, 201, 10), val_scores, label='Validation Accuracy')
plt.xlabel('Number of Trees (n_estimators)')
plt.ylabel('Accuracy')
plt.title('Random Forest Training vs Validation Accuracy')
plt.legend()
plt.grid()
plt.show()
final_rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_rf_model.fit(X_train, y_train)
y_pred = final_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, final_rf_model.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.7882229861007324
Precision: 0.7883267561389053
Recall: 0.7882229861007324
precision recall f1-score support
0 0.73 0.71 0.72 1161
1 0.78 0.81 0.80 1908
2 0.81 0.80 0.81 3622
accuracy 0.79 6691
macro avg 0.77 0.78 0.77 6691
weighted avg 0.79 0.79 0.79 6691
AUC-ROC Score: 0.8948954966339161
LGBMClassifier¶
import lightgbm as lgb
lgbm = lgb.LGBMClassifier(objective='multiclass', random_state=42)
lgbm.fit(X_train, y_train)
# Plot features importances
imp = pd.Series(data=rf_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
/usr/local/lib/python3.10/dist-packages/dask/dataframe/__init__.py:42: FutureWarning: Dask dataframe query planning is disabled because dask-expr is not installed. You can install it with `pip install dask[dataframe]` or `conda install dask`. This will raise in a future version. warnings.warn(msg, FutureWarning)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003266 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 5084 [LightGBM] [Info] Number of data points in the train set: 26761, number of used features: 27 [LightGBM] [Info] Start training from score -1.714587 [LightGBM] [Info] Start training from score -1.222110 [LightGBM] [Info] Start training from score -0.643683
<ipython-input-163-ba426aead1c0>:10: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
lgbm.fit(X_train, y_train, eval_metric='multi_logloss', eval_set=eval_set)
results_lgbm = lgbm.evals_result_
epochs = len(results_lgbm['valid_0']['multi_logloss'])
x_axis = range(0, epochs)
plt.figure(figsize=(10, 6))
plt.plot(x_axis, results_lgbm['valid_0']['multi_logloss'], label='Train')
plt.plot(x_axis, results_lgbm['valid_1']['multi_logloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('Training and Validation Log Loss')
plt.legend()
plt.grid()
plt.show()
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002209 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 5084 [LightGBM] [Info] Number of data points in the train set: 26761, number of used features: 27 [LightGBM] [Info] Start training from score -1.714587 [LightGBM] [Info] Start training from score -1.222110 [LightGBM] [Info] Start training from score -0.643683
y_pred = lgbm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, lgbm.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.7587804513525631
Precision: 0.7603072631063886
Recall: 0.7587804513525631
precision recall f1-score support
0 0.68 0.73 0.70 1161
1 0.75 0.75 0.75 1908
2 0.79 0.77 0.78 3622
accuracy 0.76 6691
macro avg 0.74 0.75 0.74 6691
weighted avg 0.76 0.76 0.76 6691
AUC-ROC Score: 0.8787710274426586
# @title
# prompt: target variable is credit_score make a deep learning model using pytoarch add more layers cause the model is underfitting the loss is nan i need good accuracy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# Define the neural network architecture with more layers
class CreditScoreClassifier(nn.Module):
def __init__(self, input_size):
super(CreditScoreClassifier, self).__init__()
self.fc1 = nn.Linear(input_size, 256) # Increased neurons
self.relu1 = nn.ReLU()
self.dropout1 = nn.Dropout(0.3) # Add dropout for regularization
self.fc2 = nn.Linear(256, 128) # Added another layer
self.relu2 = nn.ReLU()
self.dropout2 = nn.Dropout(0.3) # Add dropout
self.fc3 = nn.Linear(128, 64) # Added another layer
self.relu3 = nn.ReLU()
self.fc4 = nn.Linear(64, 3) # Output layer (3 classes)
def forward(self, x):
x = self.fc1(x)
x = self.relu1(x)
x = self.dropout1(x)
x = self.fc2(x)
x = self.relu2(x)
x = self.dropout2(x)
x = self.fc3(x)
x = self.relu3(x)
x = self.fc4(x)
return x
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long) # Use long for class labels
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) # Adjust batch size
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
model = CreditScoreClassifier(input_size)
criterion = nn.CrossEntropyLoss() # Use CrossEntropyLoss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001) # Use Adam optimizer
# Training loop with more epochs
num_epochs = 50 # Increase the number of epochs
for epoch in range(num_epochs):
for i, (inputs, labels) in enumerate(train_loader):
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_loader:
outputs = model(inputs)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')
Epoch [1/100], Loss: nan Epoch [2/100], Loss: nan Epoch [3/100], Loss: nan Epoch [4/100], Loss: nan Epoch [5/100], Loss: nan Epoch [6/100], Loss: nan Epoch [7/100], Loss: nan Epoch [8/100], Loss: nan Epoch [9/100], Loss: nan Epoch [10/100], Loss: nan Epoch [11/100], Loss: nan Epoch [12/100], Loss: nan Epoch [13/100], Loss: nan Epoch [14/100], Loss: nan Epoch [15/100], Loss: nan Epoch [16/100], Loss: nan Epoch [17/100], Loss: nan Epoch [18/100], Loss: nan Epoch [19/100], Loss: nan Epoch [20/100], Loss: nan Epoch [21/100], Loss: nan Epoch [22/100], Loss: nan Epoch [23/100], Loss: nan Epoch [24/100], Loss: nan Epoch [25/100], Loss: nan Epoch [26/100], Loss: nan Epoch [27/100], Loss: nan Epoch [28/100], Loss: nan Epoch [29/100], Loss: nan Epoch [30/100], Loss: nan Epoch [31/100], Loss: nan Epoch [32/100], Loss: nan Epoch [33/100], Loss: nan Epoch [34/100], Loss: nan Epoch [35/100], Loss: nan Epoch [36/100], Loss: nan Epoch [37/100], Loss: nan Epoch [38/100], Loss: nan Epoch [39/100], Loss: nan Epoch [40/100], Loss: nan Epoch [41/100], Loss: nan Epoch [42/100], Loss: nan Epoch [43/100], Loss: nan Epoch [44/100], Loss: nan Epoch [45/100], Loss: nan Epoch [46/100], Loss: nan Epoch [47/100], Loss: nan Epoch [48/100], Loss: nan Epoch [49/100], Loss: nan Epoch [50/100], Loss: nan Epoch [51/100], Loss: nan Epoch [52/100], Loss: nan Epoch [53/100], Loss: nan Epoch [54/100], Loss: nan Epoch [55/100], Loss: nan Epoch [56/100], Loss: nan Epoch [57/100], Loss: nan Epoch [58/100], Loss: nan Epoch [59/100], Loss: nan Epoch [60/100], Loss: nan Epoch [61/100], Loss: nan Epoch [62/100], Loss: nan Epoch [63/100], Loss: nan Epoch [64/100], Loss: nan Epoch [65/100], Loss: nan Epoch [66/100], Loss: nan Epoch [67/100], Loss: nan Epoch [68/100], Loss: nan Epoch [69/100], Loss: nan Epoch [70/100], Loss: nan Epoch [71/100], Loss: nan Epoch [72/100], Loss: nan Epoch [73/100], Loss: nan Epoch [74/100], Loss: nan Epoch [75/100], Loss: nan Epoch [76/100], Loss: nan Epoch [77/100], Loss: nan Epoch [78/100], Loss: nan Epoch [79/100], Loss: nan Epoch [80/100], Loss: nan Epoch [81/100], Loss: nan Epoch [82/100], Loss: nan Epoch [83/100], Loss: nan Epoch [84/100], Loss: nan Epoch [85/100], Loss: nan Epoch [86/100], Loss: nan Epoch [87/100], Loss: nan Epoch [88/100], Loss: nan Epoch [89/100], Loss: nan Epoch [90/100], Loss: nan Epoch [91/100], Loss: nan Epoch [92/100], Loss: nan Epoch [93/100], Loss: nan Epoch [94/100], Loss: nan Epoch [95/100], Loss: nan Epoch [96/100], Loss: nan Epoch [97/100], Loss: nan Epoch [98/100], Loss: nan Epoch [99/100], Loss: nan Epoch [100/100], Loss: nan Accuracy of the model on the test images: 17.35166641757585%
# @title
# Get the feature names used during training
training_features = optimized_xgb.get_booster().feature_names
# Ensure training_features are present in data_test
# Instead of filtering, reorder data_test_subset columns to match training_features
data_test_subset = data_test[[f for f in training_features if f in data_test.columns]]
# Now, make the prediction
y_pred_final = optimized_xgb.predict(data_test_subset)
y_pred_final
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-174-128d13539418> in <cell line: 9>() 7 8 # Now, make the prediction ----> 9 y_pred_final = optimized_xgb.predict(data_test_subset) 10 y_pred_final /usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py in predict(self, X, output_margin, validate_features, base_margin, iteration_range) 1563 ) -> ArrayLike: 1564 with config_context(verbosity=self.verbosity): -> 1565 class_probs = super().predict( 1566 X=X, 1567 output_margin=output_margin, /usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py in predict(self, X, output_margin, validate_features, base_margin, iteration_range) 1184 if self._can_use_inplace_predict(): 1185 try: -> 1186 predts = self.get_booster().inplace_predict( 1187 data=X, 1188 iteration_range=iteration_range, /usr/local/lib/python3.10/dist-packages/xgboost/core.py in inplace_predict(self, data, iteration_range, predict_type, missing, validate_features, base_margin, strict_shape) 2512 data, fns, _ = _transform_pandas_df(data, enable_categorical) 2513 if validate_features: -> 2514 self._validate_features(fns) 2515 if _is_list(data) or _is_tuple(data): 2516 data = np.array(data) /usr/local/lib/python3.10/dist-packages/xgboost/core.py in _validate_features(self, feature_names) 3077 ) 3078 -> 3079 raise ValueError(msg.format(self.feature_names, feature_names)) 3080 3081 def get_split_value_histogram( ValueError: feature_names mismatch: ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance', 'is_train'] ['Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date', 'Num_Credit_Inquiries', 'Credit_Utilization_Ratio', 'Total_EMI_per_month'] expected ID, Type_of_Loan, Occupation, Annual_Income, Age, Payment_Behaviour, is_train, Name, Changed_Credit_Limit, Num_of_Loan, Num_of_Delayed_Payment, Credit_History_Age, Monthly_Balance, Outstanding_Debt, Amount_invested_monthly, SSN, Credit_Mix, Customer_ID, Payment_of_Min_Amount, Month in input data
# @title
Credit_Score_to_Label = {
2:'Good',
1:'Standard',
0:'Poor'
}
y_pred_credit_score = np.vectorize(Credit_Score_to_Label.get)(y_pred_final)
y_pred_credit_score
# @title
submission = pd.DataFrame({
'ID': df_test['ID'],
'Credit_Score': y_pred_credit_score
})
submission
!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/IS675_lab02.ipynb"
# @title
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Score', data=submission, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Score on Test')
plt.xlabel('Credit_Score on Test')
plt.ylabel('Count')
plt.show()